feat(ai-services): dedicated GET /avm/v2/feature-importance endpoint (TEC-2760)

Exposes ensemble feature importance as a standalone endpoint per the R5.1 spec.
When trained boosters are loaded, aggregates their gain importances weighted as
XGBoost (0.4) + LightGBM (0.35) + CatBoost (0.25); otherwise falls back to the
curated heuristic ranking, so callers can depend on the endpoint during
scaffold/heuristic-only runs.

- Factored heuristic drivers into a shared constant (_HEURISTIC_DRIVERS)
- Added AVMv2FeatureImportanceResponse model (model_version + source + drivers)
- Added service.get_feature_importance() public method
- Added tests/test_avm_v2.py::test_feature_importance_heuristic (24 total pass)

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-18 15:27:30 +07:00
parent 5731577fa9
commit 729afe2db6
4 changed files with 92 additions and 16 deletions

View File

@@ -22,6 +22,7 @@ from app.models.avm_v2 import (
AVMv1Summary,
AVMv2Comparable,
AVMv2FeatureImportance,
AVMv2FeatureImportanceResponse,
AVMv2ModelInfo,
AVMv2PredictRequest,
AVMv2PredictResponse,
@@ -121,6 +122,30 @@ CITY_BASELINE: dict[str, float] = {
}
DEFAULT_BASELINE = 30.0
# ── Heuristic feature importance ────────────────────────────────
# Curated (feature name, importance weight) pairs, listed in descending
# order of weight; the weights nominally sum to 1.0.
# Used both inside heuristic predict responses and by the dedicated
# feature-importance endpoint when no trained booster is loaded.
_HEURISTIC_DRIVERS: list[tuple[str, float]] = [
    ("area_m2", 0.14),
    ("avg_price_district_3m_vnd_m2", 0.12),
    ("neighborhood_score", 0.10),
    ("property_type_encoded", 0.10),
    ("distance_to_cbd_km", 0.08),
    ("developer_reputation", 0.07),
    ("renovation_score", 0.07),
    ("building_age_years", 0.06),
    ("direction_encoded", 0.05),
    ("floor_level", 0.05),
    ("has_legal_paper", 0.05),
    ("distance_to_metro_km", 0.04),
    ("interior_quality", 0.04),
    ("price_momentum_30d", 0.03),
]
def _heuristic_drivers() -> list[AVMv2FeatureImportance]:
    """Build the curated heuristic ranking as response models.

    Returns:
        A newly built list with one ``AVMv2FeatureImportance`` per entry of
        ``_HEURISTIC_DRIVERS``, in the same order as that table.
    """
    ranked: list[AVMv2FeatureImportance] = []
    for name, weight in _HEURISTIC_DRIVERS:
        # Each call creates fresh model objects; the shared table is only read.
        ranked.append(AVMv2FeatureImportance(feature=name, importance=weight))
    return ranked
def _encode_features(req: AVMv2PredictRequest) -> np.ndarray:
"""Encode a prediction request into a feature vector."""
@@ -468,22 +493,7 @@ class AVMv2EnsembleService:
confidence = max(0.0, min(1.0, 1.0 - cv))
# Heuristic driver ranking
drivers = [
AVMv2FeatureImportance(feature="area_m2", importance=0.14),
AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.12),
AVMv2FeatureImportance(feature="neighborhood_score", importance=0.10),
AVMv2FeatureImportance(feature="property_type_encoded", importance=0.10),
AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.08),
AVMv2FeatureImportance(feature="developer_reputation", importance=0.07),
AVMv2FeatureImportance(feature="renovation_score", importance=0.07),
AVMv2FeatureImportance(feature="building_age_years", importance=0.06),
AVMv2FeatureImportance(feature="direction_encoded", importance=0.05),
AVMv2FeatureImportance(feature="floor_level", importance=0.05),
AVMv2FeatureImportance(feature="has_legal_paper", importance=0.05),
AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.04),
AVMv2FeatureImportance(feature="interior_quality", importance=0.04),
AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.03),
]
drivers = _heuristic_drivers()
return AVMv2PredictResponse(
estimated_price_vnd=round(estimated, -3),
@@ -544,6 +554,26 @@ class AVMv2EnsembleService:
for f, v in sorted_imp
]
def get_feature_importance(self) -> AVMv2FeatureImportanceResponse:
    """Report global feature importance for the currently active ensemble.

    Importances derived from the loaded boosters are preferred. If no
    boosters are loaded, or the model-derived ranking comes back empty,
    the curated heuristic ranking is returned instead, so the endpoint
    keeps answering during scaffolded / heuristic-only runs.

    Returns:
        An ``AVMv2FeatureImportanceResponse`` carrying the service's model
        version, a ``source`` of ``"model"`` or ``"heuristic"``, and the
        ranked drivers.
    """
    # Only query the boosters when at least one is loaded.
    model_ranking = []
    if self._models:
        model_ranking = self._get_feature_importance()

    if model_ranking:
        origin = "model"
        ranking = model_ranking
    else:
        # Nothing usable from the boosters: serve the curated ranking.
        origin = "heuristic"
        ranking = _heuristic_drivers()

    return AVMv2FeatureImportanceResponse(
        model_version=self._model_version,
        source=origin,
        drivers=ranking,
    )
# ── Training pipeline ───────────────────────────────────────
def train(self, req: AVMv2TrainRequest) -> AVMv2TrainResponse: