feat(ai-services): dedicated GET /avm/v2/feature-importance endpoint (TEC-2760)
Exposes ensemble feature importance as a standalone endpoint per the R5.1 spec. Aggregates XGBoost (0.4) + LightGBM (0.35) + CatBoost (0.25) gain when trained boosters are loaded; falls back to the curated heuristic ranking otherwise, so callers can depend on the endpoint during scaffold/heuristic-only runs.

- Factored heuristic drivers into a shared constant (_HEURISTIC_DRIVERS)
- Added AVMv2FeatureImportanceResponse model (model_version + source + drivers)
- Added service.get_feature_importance() public method
- Added tests/test_avm_v2.py::test_feature_importance_heuristic (24 total pass)

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -213,6 +213,19 @@ class AVMv2RollbackRequest(BaseModel):
|
|||||||
target_version: str = Field(..., min_length=1, description="Model version to roll back to")
|
target_version: str = Field(..., min_length=1, description="Model version to roll back to")
|
||||||
|
|
||||||
|
|
||||||
|
class AVMv2FeatureImportanceResponse(BaseModel):
    """Response payload describing ensemble-wide (global) feature importance.

    The ``source`` field records where the ranking came from:

    * ``"model"`` - importances taken from the trained boosters
      (weighted XGBoost gain + LightGBM gain + CatBoost importance).
    * ``"heuristic"`` - the service is running without trained artifacts.
    """

    # Version identifier of the model the ranking is reported for.
    model_version: str

    # Provenance of the ranking; required, see the class docstring for values.
    source: str = Field(..., description="One of: model, heuristic")

    # Per-feature importance entries; defaults to a fresh empty list.
    drivers: list[AVMv2FeatureImportance] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
class AVMv1Summary(BaseModel):
|
class AVMv1Summary(BaseModel):
|
||||||
"""Compact summary of a v1 prediction for comparison."""
|
"""Compact summary of a v1 prediction for comparison."""
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from fastapi import APIRouter, HTTPException
|
|||||||
from app.models.avm_v2 import (
|
from app.models.avm_v2 import (
|
||||||
ABComparisonRequest,
|
ABComparisonRequest,
|
||||||
ABComparisonResponse,
|
ABComparisonResponse,
|
||||||
|
AVMv2FeatureImportanceResponse,
|
||||||
AVMv2ModelInfo,
|
AVMv2ModelInfo,
|
||||||
AVMv2PredictRequest,
|
AVMv2PredictRequest,
|
||||||
AVMv2PredictResponse,
|
AVMv2PredictResponse,
|
||||||
@@ -54,6 +55,17 @@ def model_info_v2() -> AVMv2ModelInfo:
|
|||||||
return avm_v2_service.get_model_info()
|
return avm_v2_service.get_model_info()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/feature-importance", response_model=AVMv2FeatureImportanceResponse)
def feature_importance_v2() -> AVMv2FeatureImportanceResponse:
    """Return global feature importance for the active ensemble.

    With trained boosters loaded, the ranking aggregates XGBoost gain (0.4),
    LightGBM gain (0.35) and CatBoost importance (0.25). Without artifacts the
    service answers with a curated heuristic ranking instead.

    Returns:
        The response built by ``avm_v2_service.get_feature_importance()``.
    """
    importance = avm_v2_service.get_feature_importance()
    return importance
|
||||||
|
|
||||||
|
|
||||||
@router.get("/versions", response_model=list[AVMv2ModelInfo])
|
@router.get("/versions", response_model=list[AVMv2ModelInfo])
|
||||||
def list_versions() -> list[AVMv2ModelInfo]:
|
def list_versions() -> list[AVMv2ModelInfo]:
|
||||||
"""List all registered model versions with their metrics and status."""
|
"""List all registered model versions with their metrics and status."""
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from app.models.avm_v2 import (
|
|||||||
AVMv1Summary,
|
AVMv1Summary,
|
||||||
AVMv2Comparable,
|
AVMv2Comparable,
|
||||||
AVMv2FeatureImportance,
|
AVMv2FeatureImportance,
|
||||||
|
AVMv2FeatureImportanceResponse,
|
||||||
AVMv2ModelInfo,
|
AVMv2ModelInfo,
|
||||||
AVMv2PredictRequest,
|
AVMv2PredictRequest,
|
||||||
AVMv2PredictResponse,
|
AVMv2PredictResponse,
|
||||||
@@ -121,6 +122,30 @@ CITY_BASELINE: dict[str, float] = {
|
|||||||
}
|
}
|
||||||
DEFAULT_BASELINE = 30.0
|
DEFAULT_BASELINE = 30.0
|
||||||
|
|
||||||
|
# ── Heuristic feature importance ────────────────────────────────
|
||||||
|
# Used both inside heuristic predict responses and by the dedicated
|
||||||
|
# feature-importance endpoint when no trained booster is loaded.
|
||||||
|
# (feature name, importance weight) pairs, ordered from most to least
# important. Stored as an immutable tuple of tuples: this constant is shared
# module state, so an accidental in-place mutation would otherwise leak into
# every later heuristic response.
_HEURISTIC_DRIVERS: tuple[tuple[str, float], ...] = (
    ("area_m2", 0.14),
    ("avg_price_district_3m_vnd_m2", 0.12),
    ("neighborhood_score", 0.10),
    ("property_type_encoded", 0.10),
    ("distance_to_cbd_km", 0.08),
    ("developer_reputation", 0.07),
    ("renovation_score", 0.07),
    ("building_age_years", 0.06),
    ("direction_encoded", 0.05),
    ("floor_level", 0.05),
    ("has_legal_paper", 0.05),
    ("distance_to_metro_km", 0.04),
    ("interior_quality", 0.04),
    ("price_momentum_30d", 0.03),
)
|
||||||
|
|
||||||
|
|
||||||
|
def _heuristic_drivers() -> list[AVMv2FeatureImportance]:
    """Build a new list of importance entries from ``_HEURISTIC_DRIVERS``.

    Each (feature, weight) pair becomes one ``AVMv2FeatureImportance``; the
    order of the constant is preserved. A fresh list is created on every call.
    """
    return [
        AVMv2FeatureImportance(feature=name, importance=weight)
        for name, weight in _HEURISTIC_DRIVERS
    ]
|
||||||
|
|
||||||
|
|
||||||
def _encode_features(req: AVMv2PredictRequest) -> np.ndarray:
|
def _encode_features(req: AVMv2PredictRequest) -> np.ndarray:
|
||||||
"""Encode a prediction request into a feature vector."""
|
"""Encode a prediction request into a feature vector."""
|
||||||
@@ -468,22 +493,7 @@ class AVMv2EnsembleService:
|
|||||||
confidence = max(0.0, min(1.0, 1.0 - cv))
|
confidence = max(0.0, min(1.0, 1.0 - cv))
|
||||||
|
|
||||||
# Heuristic driver ranking
|
# Heuristic driver ranking
|
||||||
drivers = [
|
drivers = _heuristic_drivers()
|
||||||
AVMv2FeatureImportance(feature="area_m2", importance=0.14),
|
|
||||||
AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.12),
|
|
||||||
AVMv2FeatureImportance(feature="neighborhood_score", importance=0.10),
|
|
||||||
AVMv2FeatureImportance(feature="property_type_encoded", importance=0.10),
|
|
||||||
AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.08),
|
|
||||||
AVMv2FeatureImportance(feature="developer_reputation", importance=0.07),
|
|
||||||
AVMv2FeatureImportance(feature="renovation_score", importance=0.07),
|
|
||||||
AVMv2FeatureImportance(feature="building_age_years", importance=0.06),
|
|
||||||
AVMv2FeatureImportance(feature="direction_encoded", importance=0.05),
|
|
||||||
AVMv2FeatureImportance(feature="floor_level", importance=0.05),
|
|
||||||
AVMv2FeatureImportance(feature="has_legal_paper", importance=0.05),
|
|
||||||
AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.04),
|
|
||||||
AVMv2FeatureImportance(feature="interior_quality", importance=0.04),
|
|
||||||
AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.03),
|
|
||||||
]
|
|
||||||
|
|
||||||
return AVMv2PredictResponse(
|
return AVMv2PredictResponse(
|
||||||
estimated_price_vnd=round(estimated, -3),
|
estimated_price_vnd=round(estimated, -3),
|
||||||
@@ -544,6 +554,26 @@ class AVMv2EnsembleService:
|
|||||||
for f, v in sorted_imp
|
for f, v in sorted_imp
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def get_feature_importance(self) -> AVMv2FeatureImportanceResponse:
    """Report global feature importance for the active ensemble.

    When models are loaded, the ranking is requested from
    ``self._get_feature_importance()``. If no models are loaded, or that
    ranking comes back empty, the curated heuristic ranking is returned
    instead so the endpoint stays usable during scaffolded / heuristic-only
    runs.

    Returns:
        A response carrying ``self._model_version``, the ranking, and a
        ``source`` of ``"model"`` or ``"heuristic"`` naming its origin.
    """
    model_ranking = self._get_feature_importance() if self._models else []

    # An empty model ranking is treated the same as having no models at all.
    if model_ranking:
        origin, ranking = "model", model_ranking
    else:
        origin, ranking = "heuristic", _heuristic_drivers()

    return AVMv2FeatureImportanceResponse(
        model_version=self._model_version,
        source=origin,
        drivers=ranking,
    )
|
||||||
|
|
||||||
# ── Training pipeline ───────────────────────────────────────
|
# ── Training pipeline ───────────────────────────────────────
|
||||||
|
|
||||||
def train(self, req: AVMv2TrainRequest) -> AVMv2TrainResponse:
|
def train(self, req: AVMv2TrainRequest) -> AVMv2TrainResponse:
|
||||||
|
|||||||
@@ -261,6 +261,27 @@ def test_model_info_v2():
|
|||||||
assert data["is_active"] is True
|
assert data["is_active"] is True
|
||||||
|
|
||||||
|
|
||||||
|
# ── Feature importance endpoint ──────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_feature_importance_heuristic():
    """With no trained models loaded, the endpoint serves the heuristic ranking."""
    response = client.get("/avm/v2/feature-importance")
    assert response.status_code == 200

    payload = response.json()
    assert payload["source"] == "heuristic"
    assert payload["model_version"] == "ensemble-v2-heuristic"

    ranking = payload["drivers"]
    assert len(ranking) > 0

    weights = [entry["importance"] for entry in ranking]
    # Drivers must arrive ranked from most to least important, each in [0, 1].
    assert weights == sorted(weights, reverse=True)
    assert all(0 <= weight <= 1 for weight in weights)

    names = {entry["feature"] for entry in ranking}
    assert "area_m2" in names
    assert "neighborhood_score" in names
|
||||||
|
|
||||||
|
|
||||||
# ── Model versioning ────────────────────────────────────────────
|
# ── Model versioning ────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user