From 729afe2db65e7f53555171b4f632a7bb5aa2d73c Mon Sep 17 00:00:00 2001 From: Ho Ngoc Hai Date: Sat, 18 Apr 2026 15:27:30 +0700 Subject: [PATCH] feat(ai-services): dedicated GET /avm/v2/feature-importance endpoint (TEC-2760) Exposes ensemble feature importance as a standalone endpoint per R5.1 spec. Aggregates XGBoost (0.4) + LightGBM (0.35) + CatBoost (0.25) gain when trained boosters are loaded; falls back to the curated heuristic ranking otherwise, so callers can depend on the endpoint during scaffold/heuristic-only runs. - Factored heuristic drivers into a shared constant (_HEURISTIC_DRIVERS) - Added AVMv2FeatureImportanceResponse model (model_version + source + drivers) - Added service.get_feature_importance() public method - Added tests/test_avm_v2.py::test_feature_importance_heuristic (24 total pass) Co-Authored-By: Paperclip --- libs/ai-services/app/models/avm_v2.py | 13 ++++ libs/ai-services/app/routers/avm_v2.py | 12 ++++ .../app/services/avm_v2_service.py | 62 ++++++++++++++----- libs/ai-services/tests/test_avm_v2.py | 21 +++++++ 4 files changed, 92 insertions(+), 16 deletions(-) diff --git a/libs/ai-services/app/models/avm_v2.py b/libs/ai-services/app/models/avm_v2.py index a4e57fd..af9d378 100644 --- a/libs/ai-services/app/models/avm_v2.py +++ b/libs/ai-services/app/models/avm_v2.py @@ -213,6 +213,19 @@ class AVMv2RollbackRequest(BaseModel): target_version: str = Field(..., min_length=1, description="Model version to roll back to") +class AVMv2FeatureImportanceResponse(BaseModel): + """Global feature importance across the loaded ensemble. + + `source` is `"model"` when importances come from the trained boosters + (weighted XGBoost gain + LightGBM gain + CatBoost importance), or + `"heuristic"` when the service is running without trained artifacts. + """ + + model_version: str + source: str = Field(..., description="One of: model, heuristic") + drivers: list[AVMv2FeatureImportance] = Field(default_factory=list) + + class AVMv1Summary(BaseModel): """Compact summary of a v1 prediction for comparison.""" diff --git a/libs/ai-services/app/routers/avm_v2.py b/libs/ai-services/app/routers/avm_v2.py index afa50ae..196e9c7 100644 --- a/libs/ai-services/app/routers/avm_v2.py +++ b/libs/ai-services/app/routers/avm_v2.py @@ -5,6 +5,7 @@ from fastapi import APIRouter, HTTPException from app.models.avm_v2 import ( ABComparisonRequest, ABComparisonResponse, + AVMv2FeatureImportanceResponse, AVMv2ModelInfo, AVMv2PredictRequest, AVMv2PredictResponse, @@ -54,6 +55,17 @@ def model_info_v2() -> AVMv2ModelInfo: return avm_v2_service.get_model_info() +@router.get("/feature-importance", response_model=AVMv2FeatureImportanceResponse) +def feature_importance_v2() -> AVMv2FeatureImportanceResponse: + """Global feature importance for the active ensemble. + + Aggregates XGBoost gain (0.4) + LightGBM gain (0.35) + CatBoost importance (0.25) + when trained boosters are loaded. Falls back to a curated heuristic ranking when + the service is running without artifacts. + """ + return avm_v2_service.get_feature_importance() + + @router.get("/versions", response_model=list[AVMv2ModelInfo]) def list_versions() -> list[AVMv2ModelInfo]: """List all registered model versions with their metrics and status.""" diff --git a/libs/ai-services/app/services/avm_v2_service.py b/libs/ai-services/app/services/avm_v2_service.py index 9e8041d..6fbd7bc 100644 --- a/libs/ai-services/app/services/avm_v2_service.py +++ b/libs/ai-services/app/services/avm_v2_service.py @@ -22,6 +22,7 @@ from app.models.avm_v2 import ( AVMv1Summary, AVMv2Comparable, AVMv2FeatureImportance, + AVMv2FeatureImportanceResponse, AVMv2ModelInfo, AVMv2PredictRequest, AVMv2PredictResponse, @@ -121,6 +122,30 @@ CITY_BASELINE: dict[str, float] = { } DEFAULT_BASELINE = 30.0 +# ── Heuristic feature importance ──────────────────────────────── +# Used both inside heuristic predict responses and by the dedicated +# feature-importance endpoint when no trained booster is loaded. +_HEURISTIC_DRIVERS: list[tuple[str, float]] = [ + ("area_m2", 0.14), + ("avg_price_district_3m_vnd_m2", 0.12), + ("neighborhood_score", 0.10), + ("property_type_encoded", 0.10), + ("distance_to_cbd_km", 0.08), + ("developer_reputation", 0.07), + ("renovation_score", 0.07), + ("building_age_years", 0.06), + ("direction_encoded", 0.05), + ("floor_level", 0.05), + ("has_legal_paper", 0.05), + ("distance_to_metro_km", 0.04), + ("interior_quality", 0.04), + ("price_momentum_30d", 0.03), +] + + +def _heuristic_drivers() -> list[AVMv2FeatureImportance]: + return [AVMv2FeatureImportance(feature=f, importance=w) for f, w in _HEURISTIC_DRIVERS] + def _encode_features(req: AVMv2PredictRequest) -> np.ndarray: """Encode a prediction request into a feature vector.""" @@ -468,22 +493,7 @@ class AVMv2EnsembleService: confidence = max(0.0, min(1.0, 1.0 - cv)) # Heuristic driver ranking - drivers = [ - AVMv2FeatureImportance(feature="area_m2", importance=0.14), - AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.12), - AVMv2FeatureImportance(feature="neighborhood_score", importance=0.10), - AVMv2FeatureImportance(feature="property_type_encoded", importance=0.10), - AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.08), - AVMv2FeatureImportance(feature="developer_reputation", importance=0.07), - AVMv2FeatureImportance(feature="renovation_score", importance=0.07), - AVMv2FeatureImportance(feature="building_age_years", importance=0.06), - AVMv2FeatureImportance(feature="direction_encoded", importance=0.05), - AVMv2FeatureImportance(feature="floor_level", importance=0.05), - AVMv2FeatureImportance(feature="has_legal_paper", importance=0.05), - AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.04), - AVMv2FeatureImportance(feature="interior_quality", importance=0.04), - AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.03), - ] + drivers = _heuristic_drivers() return AVMv2PredictResponse( estimated_price_vnd=round(estimated, -3), @@ -544,6 +554,26 @@ class AVMv2EnsembleService: for f, v in sorted_imp ] + def get_feature_importance(self) -> AVMv2FeatureImportanceResponse: + """Return global feature importance for the active ensemble. + + Prefers trained-booster importances (weighted gain aggregation). Falls + back to a curated heuristic ranking when no boosters are loaded so the + endpoint stays available during scaffolded / heuristic-only runs. + """ + drivers = self._get_feature_importance() if self._models else [] + if drivers: + return AVMv2FeatureImportanceResponse( + model_version=self._model_version, + source="model", + drivers=drivers, + ) + return AVMv2FeatureImportanceResponse( + model_version=self._model_version, + source="heuristic", + drivers=_heuristic_drivers(), + ) + # ── Training pipeline ─────────────────────────────────────── def train(self, req: AVMv2TrainRequest) -> AVMv2TrainResponse: diff --git a/libs/ai-services/tests/test_avm_v2.py b/libs/ai-services/tests/test_avm_v2.py index d979947..6ba34ee 100644 --- a/libs/ai-services/tests/test_avm_v2.py +++ b/libs/ai-services/tests/test_avm_v2.py @@ -261,6 +261,27 @@ def test_model_info_v2(): assert data["is_active"] is True +# ── Feature importance endpoint ────────────────────────────────── + + +def test_feature_importance_heuristic(): + """Dedicated endpoint returns heuristic drivers when no models are loaded.""" + resp = client.get("/avm/v2/feature-importance") + assert resp.status_code == 200 + data = resp.json() + + assert data["source"] == "heuristic" + assert data["model_version"] == "ensemble-v2-heuristic" + drivers = data["drivers"] + assert len(drivers) > 0 + importances = [d["importance"] for d in drivers] + assert importances == sorted(importances, reverse=True) + assert all(0 <= i <= 1 for i in importances) + feature_names = {d["feature"] for d in drivers} + assert "area_m2" in feature_names + assert "neighborhood_score" in feature_names + + # ── Model versioning ────────────────────────────────────────────