From 9eaec46a3725acef08e326d1485bbba6e9eeafa5 Mon Sep 17 00:00:00 2001 From: Ho Ngoc Hai Date: Thu, 16 Apr 2026 17:55:03 +0700 Subject: [PATCH] =?UTF-8?q?feat(ai-services):=20AVM=20v2=20residential=20?= =?UTF-8?q?=E2=80=94=20expanded=20features,=20training=20pipeline,=20model?= =?UTF-8?q?=20versioning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add neighborhood_score, developer_reputation, floor_level, direction premiums to the multi-model ensemble. Implement real Optuna-based training pipeline for XGBoost/LightGBM/CatBoost with grouped train/val/test splits. Add file-based model registry with rollback and list-versions endpoints. 23 Python tests covering all new features. Co-Authored-By: Paperclip --- libs/ai-services/app/models/avm_v2.py | 33 + libs/ai-services/app/routers/avm_v2.py | 26 +- .../app/services/avm_v2_service.py | 614 ++++++++++++++++-- libs/ai-services/tests/test_avm_v2.py | 126 +++- 4 files changed, 743 insertions(+), 56 deletions(-) diff --git a/libs/ai-services/app/models/avm_v2.py b/libs/ai-services/app/models/avm_v2.py index 735c677..a4e57fd 100644 --- a/libs/ai-services/app/models/avm_v2.py +++ b/libs/ai-services/app/models/avm_v2.py @@ -29,10 +29,28 @@ class AVMv2PredictRequest(BaseModel): 0.0, ge=0, le=1, description="Flood zone risk score (0=safe, 1=high risk)" ) + # ── Neighborhood features ───────────────────────────── + neighborhood_score: float = Field( + 0.5, ge=0, le=1, + description="Overall neighborhood quality score (0-1, aggregated from safety, amenities, walkability)", + ) + # ── Physical features ────────────────────────────────── property_type: str = Field(..., description="e.g. apartment, house, villa, land") area_m2: float = Field(..., gt=0, description="Property area in m²") rooms: int = Field(0, ge=0, description="Total rooms (bedrooms)") + floor_level: int = Field( + 0, ge=0, + description="Floor level (0=ground or N/A, relevant for apartments/penthouses)", + ) + total_floors: int = Field( + 0, ge=0, + description="Total floors in the building (0=N/A)", + ) + direction: str = Field( + "unknown", + description="Facing direction: north, south, east, west, northeast, northwest, southeast, southwest, unknown", + ) floor_ratio: float = Field( 1.0, gt=0, description="Total floor area / land area ratio" ) @@ -41,6 +59,10 @@ class AVMv2PredictRequest(BaseModel): has_parking: bool = Field(False, description="Property has dedicated parking") has_pool: bool = Field(False, description="Property has swimming pool") has_legal_paper: bool = Field(True, description="Has sổ đỏ/sổ hồng") + developer_reputation: float = Field( + 0.5, ge=0, le=1, + description="Project developer reputation score (0-1, based on past projects, delivery record)", + ) # ── Market features ──────────────────────────────────── avg_price_district_3m_vnd_m2: float = Field( @@ -185,6 +207,12 @@ class AVMv2ModelInfo(BaseModel): ab_test_traffic_pct: float = Field(0.0, ge=0, le=1) +class AVMv2RollbackRequest(BaseModel): + """Request to rollback to a specific model version.""" + + target_version: str = Field(..., min_length=1, description="Model version to roll back to") + + class AVMv1Summary(BaseModel): """Compact summary of a v1 prediction for comparison.""" @@ -220,13 +248,18 @@ class ABComparisonRequest(BaseModel): frontage: float = Field(0.0, ge=0) has_legal_paper: bool = Field(True) # v2-specific features (optional, defaults applied) + neighborhood_score: float = Field(0.5, ge=0, le=1) distance_to_cbd_km: float = Field(0.0, ge=0) distance_to_metro_km: float = Field(0.0, ge=0) flood_zone_risk: float = Field(0.0, ge=0, le=1) building_age_years: int = Field(0, ge=0) + floor_level: int = Field(0, ge=0) + total_floors: int = Field(0, ge=0) + direction: str = Field("unknown") has_elevator: bool = Field(False) has_parking: bool = Field(False) has_pool: bool = Field(False) + developer_reputation: float = Field(0.5, ge=0, le=1) renovation_score: float = Field(0.5, ge=0, le=1) view_quality: float = Field(0.5, ge=0, le=1) interior_quality: float = Field(0.5, ge=0, le=1) diff --git a/libs/ai-services/app/routers/avm_v2.py b/libs/ai-services/app/routers/avm_v2.py index 584714b..afa50ae 100644 --- a/libs/ai-services/app/routers/avm_v2.py +++ b/libs/ai-services/app/routers/avm_v2.py @@ -1,6 +1,6 @@ """AVM v2 ensemble router — residential property valuation.""" -from fastapi import APIRouter +from fastapi import APIRouter, HTTPException from app.models.avm_v2 import ( ABComparisonRequest, @@ -8,6 +8,7 @@ from app.models.avm_v2 import ( AVMv2ModelInfo, AVMv2PredictRequest, AVMv2PredictResponse, + AVMv2RollbackRequest, AVMv2TrainRequest, AVMv2TrainResponse, ) @@ -30,7 +31,9 @@ def predict_v2(req: AVMv2PredictRequest) -> AVMv2PredictResponse: def train_v2(req: AVMv2TrainRequest) -> AVMv2TrainResponse: """Trigger model retraining with Optuna hyperparameter optimization. - Requires training data pipeline (Phase 3). Currently returns scaffold. + Loads training data from the model directory, runs Optuna for each + model in the ensemble, saves versioned artifacts, and registers + the new version in the model registry. """ return avm_v2_service.train(req) @@ -49,3 +52,22 @@ def compare_v1(req: ABComparisonRequest) -> ABComparisonResponse: def model_info_v2() -> AVMv2ModelInfo: """Get current active ensemble model information.""" return avm_v2_service.get_model_info() + + +@router.get("/versions", response_model=list[AVMv2ModelInfo]) +def list_versions() -> list[AVMv2ModelInfo]: + """List all registered model versions with their metrics and status.""" + return avm_v2_service.list_versions() + + +@router.post("/rollback", response_model=AVMv2ModelInfo) +def rollback(req: AVMv2RollbackRequest) -> AVMv2ModelInfo: + """Rollback to a previously trained model version. + + Copies the target version's artifacts to the active model directory, + reloads models, and updates the registry. + """ + try: + return avm_v2_service.rollback(req.target_version) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) diff --git a/libs/ai-services/app/services/avm_v2_service.py b/libs/ai-services/app/services/avm_v2_service.py index e1d6e87..9e8041d 100644 --- a/libs/ai-services/app/services/avm_v2_service.py +++ b/libs/ai-services/app/services/avm_v2_service.py @@ -5,9 +5,12 @@ Ensemble weights: XGBoost 0.4, LightGBM 0.35, CatBoost 0.25. Confidence = 1 - CV(3 predictions), where CV = std / mean. """ +import json import logging import os +import shutil from datetime import datetime, timezone +from pathlib import Path from typing import Any import numpy as np @@ -47,16 +50,22 @@ FEATURE_NAMES = [ "distance_to_park_km", "distance_to_mall_km", "flood_zone_risk", - # Physical (8) + # Neighborhood (1) + "neighborhood_score", + # Physical (13) "property_type_encoded", "area_m2", "rooms", + "floor_level", + "total_floors", + "direction_encoded", "floor_ratio", "building_age_years", "has_elevator", "has_parking", "has_pool", "has_legal_paper", + "developer_reputation", # Market (6) "avg_price_district_3m_vnd_m2", "listing_density", @@ -76,6 +85,18 @@ FEATURE_NAMES = [ "is_year_end", ] +DIRECTION_MAP = { + "south": 0, + "southeast": 1, + "east": 2, + "southwest": 3, + "northeast": 4, + "west": 5, + "northwest": 6, + "north": 7, + "unknown": 4, # neutral mid-value +} + PROPERTY_TYPE_MAP = { "apartment": 0, "house": 1, @@ -106,7 +127,7 @@ def _encode_features(req: AVMv2PredictRequest) -> np.ndarray: month_rad = 2 * np.pi * req.month / 12.0 return np.array( [[ - # Location + # Location (7) req.distance_to_cbd_km, req.distance_to_metro_km, req.distance_to_school_km, @@ -114,30 +135,36 @@ def _encode_features(req: AVMv2PredictRequest) -> np.ndarray: req.distance_to_park_km, req.distance_to_mall_km, req.flood_zone_risk, - # Physical + # Neighborhood (1) + req.neighborhood_score, + # Physical (13) PROPERTY_TYPE_MAP.get(req.property_type.lower(), 1), req.area_m2, req.rooms, + float(req.floor_level), + float(req.total_floors), + float(DIRECTION_MAP.get(req.direction.lower(), 4)), req.floor_ratio, req.building_age_years, 1.0 if req.has_elevator else 0.0, 1.0 if req.has_parking else 0.0, 1.0 if req.has_pool else 0.0, 1.0 if req.has_legal_paper else 0.0, - # Market + req.developer_reputation, + # Market (6) req.avg_price_district_3m_vnd_m2, req.listing_density, req.absorption_rate, req.dom_avg, req.price_momentum_30d, req.yoy_change, - # LLM-extracted + # LLM-extracted (5) req.renovation_score, req.view_quality, req.interior_quality, req.noise_level, req.natural_light, - # Temporal + # Temporal (3) np.sin(month_rad), np.cos(month_rad), 1.0 if req.is_year_end else 0.0, @@ -319,6 +346,9 @@ class AVMv2EnsembleService: metro_adj = 1.0 + max(0.0, (2.0 - req.distance_to_metro_km) * 0.05) flood_adj = 1.0 - req.flood_zone_risk * 0.15 + # Neighborhood adjustment: ±15% swing around 0.5 midpoint + neighborhood_adj = 1.0 + (req.neighborhood_score - 0.5) * 0.30 + # Physical adjustments room_adj = 1.0 + req.rooms * 0.015 age_adj = max(0.75, 1.0 - req.building_age_years * 0.008) @@ -330,6 +360,34 @@ class AVMv2EnsembleService: ) legal_adj = 1.0 if req.has_legal_paper else 0.70 + # Floor level premium (apartments/penthouses: higher floors = premium) + floor_adj = 1.0 + if req.floor_level > 0 and req.property_type.lower() in ("apartment", "penthouse"): + if req.total_floors > 0: + relative_floor = req.floor_level / req.total_floors + # Mid-to-high floors get up to +8% premium, ground floor -3% + floor_adj = 1.0 + (relative_floor - 0.3) * 0.12 + floor_adj = max(0.97, min(1.08, floor_adj)) + else: + # No total_floors info: mild premium for higher floors + floor_adj = min(1.08, 1.0 + req.floor_level * 0.003) + + # Direction premium (Vietnamese preference: south/southeast best) + direction_adj = { + "south": 1.05, + "southeast": 1.04, + "east": 1.02, + "southwest": 1.01, + "northeast": 1.0, + "west": 0.98, + "northwest": 0.97, + "north": 0.96, + "unknown": 1.0, + }.get(req.direction.lower(), 1.0) + + # Developer reputation: ±10% swing + developer_adj = 1.0 + (req.developer_reputation - 0.5) * 0.20 + # Market adjustments if req.avg_price_district_3m_vnd_m2 > 0: market_adj = req.avg_price_district_3m_vnd_m2 / (base * 1_000_000) @@ -357,10 +415,14 @@ class AVMv2EnsembleService: * cbd_adj * metro_adj * flood_adj + * neighborhood_adj * room_adj * age_adj * amenity_adj * legal_adj + * floor_adj + * direction_adj + * developer_adj * market_adj * momentum_adj * quality_adj @@ -407,16 +469,20 @@ class AVMv2EnsembleService: # Heuristic driver ranking drivers = [ - AVMv2FeatureImportance(feature="area_m2", importance=0.18), - AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.15), - AVMv2FeatureImportance(feature="property_type_encoded", importance=0.12), - AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.10), - AVMv2FeatureImportance(feature="renovation_score", importance=0.08), - AVMv2FeatureImportance(feature="building_age_years", importance=0.07), - AVMv2FeatureImportance(feature="has_legal_paper", importance=0.06), - AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.05), - AVMv2FeatureImportance(feature="interior_quality", importance=0.05), - AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.04), + AVMv2FeatureImportance(feature="area_m2", importance=0.14), + AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.12), + AVMv2FeatureImportance(feature="neighborhood_score", importance=0.10), + AVMv2FeatureImportance(feature="property_type_encoded", importance=0.10), + AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.08), + AVMv2FeatureImportance(feature="developer_reputation", importance=0.07), + AVMv2FeatureImportance(feature="renovation_score", importance=0.07), + AVMv2FeatureImportance(feature="building_age_years", importance=0.06), + AVMv2FeatureImportance(feature="direction_encoded", importance=0.05), + AVMv2FeatureImportance(feature="floor_level", importance=0.05), + AVMv2FeatureImportance(feature="has_legal_paper", importance=0.05), + AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.04), + AVMv2FeatureImportance(feature="interior_quality", importance=0.04), + AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.03), ] return AVMv2PredictResponse( @@ -481,52 +547,455 @@ class AVMv2EnsembleService: # ── Training pipeline ─────────────────────────────────────── def train(self, req: AVMv2TrainRequest) -> AVMv2TrainResponse: - """Train the ensemble models. + """Train the ensemble models on available data. - In production, this loads training data from the database/MinIO, - performs 5-fold CV by district with Optuna hyperparameter optimization, - and saves versioned model artifacts. - - Currently returns a scaffold response. Real training requires - the data pipeline from Phase 3. + Pipeline: + 1. Load training data from CSV/database export + 2. Feature engineering (encode, normalize, cyclical) + 3. Train/val/test split stratified by district + 4. For each model: Optuna hyperparameter optimization + 5. Save versioned artifacts + register in model registry """ + from app.config import settings + version = f"ensemble-v2-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}" logger.info("Training AVM v2 ensemble — version %s, trials=%d", version, req.optuna_trials) - # TODO: Replace with actual training pipeline when data is available - # 1. Load data from PostgreSQL/MinIO - # 2. Feature engineering (encode categoricals, normalize, cyclical) - # 3. 80/10/10 split stratified by district - # 4. For each model (XGBoost, LightGBM, CatBoost): - # a. Optuna study with req.optuna_trials trials - # b. 5-fold CV grouped by district - # c. Train on best params - # 5. Save artifacts to MinIO with version tag - # 6. Register in model registry + model_dir = Path(settings.model_path) + data_path = model_dir / "training_data.csv" + + # Check for training data + if not data_path.exists(): + logger.warning("No training data found at %s — returning scaffold", data_path) + return AVMv2TrainResponse( + model_version=version, + metrics={"mae": 0.0, "mape": 0.0, "rmse": 0.0, "r2": 0.0}, + district_metrics={}, + training_samples=0, + validation_samples=0, + test_samples=0, + best_params={}, + ) + + # Load and prepare data + import pandas as pd + from sklearn.model_selection import GroupShuffleSplit + + df = pd.read_csv(data_path) + logger.info("Loaded %d training samples", len(df)) + + # Feature engineering + X, y, groups = self._prepare_training_data(df) + if len(X) < 50: + logger.warning("Insufficient training data (%d samples)", len(X)) + return AVMv2TrainResponse( + model_version=version, + metrics={"mae": 0.0, "mape": 0.0, "rmse": 0.0, "r2": 0.0}, + district_metrics={}, + training_samples=len(X), + validation_samples=0, + test_samples=0, + best_params={}, + ) + + # Split: train/val/test grouped by district + gss_test = GroupShuffleSplit(n_splits=1, test_size=req.test_size, random_state=42) + train_val_idx, test_idx = next(gss_test.split(X, y, groups)) + X_trainval, y_trainval = X[train_val_idx], y[train_val_idx] + X_test, y_test = X[test_idx], y[test_idx] + groups_trainval = groups[train_val_idx] + + val_ratio = req.val_size / (1.0 - req.test_size) + gss_val = GroupShuffleSplit(n_splits=1, test_size=val_ratio, random_state=42) + train_idx, val_idx = next(gss_val.split(X_trainval, y_trainval, groups_trainval)) + X_train, y_train = X_trainval[train_idx], y_trainval[train_idx] + X_val, y_val = X_trainval[val_idx], y_trainval[val_idx] + + logger.info("Split: train=%d, val=%d, test=%d", len(X_train), len(X_val), len(X_test)) + + # Train each model with Optuna + best_params: dict[str, dict] = {} + trained_models: dict[str, Any] = {} + + xgb_params, xgb_model = self._train_xgboost(X_train, y_train, X_val, y_val, req.optuna_trials) + if xgb_model: + best_params["xgboost"] = xgb_params + trained_models["xgboost"] = xgb_model + + lgb_params, lgb_model = self._train_lightgbm(X_train, y_train, X_val, y_val, req.optuna_trials) + if lgb_model: + best_params["lightgbm"] = lgb_params + trained_models["lightgbm"] = lgb_model + + cat_params, cat_model = self._train_catboost(X_train, y_train, X_val, y_val, req.optuna_trials) + if cat_model: + best_params["catboost"] = cat_params + trained_models["catboost"] = cat_model + + # Evaluate ensemble on test set + metrics = self._evaluate_ensemble(trained_models, X_test, y_test) + + # Save versioned artifacts + version_dir = model_dir / "versions" / version + version_dir.mkdir(parents=True, exist_ok=True) + + for name, model in trained_models.items(): + self._save_model(name, model, version_dir) + # Also save to active model directory + self._save_model(name, model, model_dir) + + # Register in model registry + registry_entry = AVMv2ModelInfo( + model_version=version, + created_at=datetime.now(timezone.utc).isoformat(), + metrics=metrics, + is_active=True, + ab_test_traffic_pct=0.0, + ) + self._register_model(registry_entry, model_dir) + + # Reload models + self._models = trained_models + self._model_version = version return AVMv2TrainResponse( model_version=version, - metrics={ - "mae": 0.0, - "mape": 0.0, - "rmse": 0.0, - "r2": 0.0, - }, + metrics=metrics, district_metrics={}, - training_samples=0, - validation_samples=0, - test_samples=0, - best_params={ - "xgboost": {"n_estimators": 500, "max_depth": 6, "learning_rate": 0.05}, - "lightgbm": {"n_estimators": 500, "num_leaves": 31, "learning_rate": 0.05}, - "catboost": {"iterations": 500, "depth": 6, "learning_rate": 0.05}, - }, + training_samples=len(X_train), + validation_samples=len(X_val), + test_samples=len(X_test), + best_params=best_params, ) + def _prepare_training_data( + self, df: "pd.DataFrame" + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """Encode a DataFrame into feature matrix, target vector, and group labels.""" + import pandas as pd # noqa: F811 + + feature_cols = [ + "distance_to_cbd_km", "distance_to_metro_km", "distance_to_school_km", + "distance_to_hospital_km", "distance_to_park_km", "distance_to_mall_km", + "flood_zone_risk", "neighborhood_score", + "property_type", "area_m2", "rooms", "floor_level", "total_floors", + "direction", "floor_ratio", "building_age_years", + "has_elevator", "has_parking", "has_pool", "has_legal_paper", + "developer_reputation", + "avg_price_district_3m_vnd_m2", "listing_density", "absorption_rate", + "dom_avg", "price_momentum_30d", "yoy_change", + "renovation_score", "view_quality", "interior_quality", + "noise_level", "natural_light", + "month", + ] + + # Fill missing columns with defaults + for col in feature_cols: + if col not in df.columns: + df[col] = 0.0 if col not in ("property_type", "direction") else "unknown" + + # Encode categoricals + df["property_type_encoded"] = df["property_type"].str.lower().map(PROPERTY_TYPE_MAP).fillna(1) + df["direction_encoded"] = df["direction"].str.lower().map(DIRECTION_MAP).fillna(4) + + # Cyclical month encoding + month_rad = 2 * np.pi * df["month"].astype(float) / 12.0 + df["month_sin"] = np.sin(month_rad) + df["month_cos"] = np.cos(month_rad) + df["is_year_end_encoded"] = (df["month"].astype(int).isin([10, 11, 12])).astype(float) + + # Boolean encoding + for col in ["has_elevator", "has_parking", "has_pool", "has_legal_paper"]: + df[col] = df[col].astype(float) + + encoded_feature_cols = [ + "distance_to_cbd_km", "distance_to_metro_km", "distance_to_school_km", + "distance_to_hospital_km", "distance_to_park_km", "distance_to_mall_km", + "flood_zone_risk", "neighborhood_score", + "property_type_encoded", "area_m2", "rooms", "floor_level", "total_floors", + "direction_encoded", "floor_ratio", "building_age_years", + "has_elevator", "has_parking", "has_pool", "has_legal_paper", + "developer_reputation", + "avg_price_district_3m_vnd_m2", "listing_density", "absorption_rate", + "dom_avg", "price_momentum_30d", "yoy_change", + "renovation_score", "view_quality", "interior_quality", + "noise_level", "natural_light", + "month_sin", "month_cos", "is_year_end_encoded", + ] + + X = df[encoded_feature_cols].values.astype(np.float64) + y = np.log(df["price_vnd"].values.astype(np.float64)) # Log-price target + groups = df.get("district", pd.Series(["default"] * len(df))).values + + return X, y, groups + + def _train_xgboost( + self, + X_train: np.ndarray, y_train: np.ndarray, + X_val: np.ndarray, y_val: np.ndarray, + n_trials: int, + ) -> tuple[dict, Any]: + """Train XGBoost with Optuna hyperparameter optimization.""" + try: + import optuna + import xgboost as xgb + + optuna.logging.set_verbosity(optuna.logging.WARNING) + + dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=FEATURE_NAMES) + dval = xgb.DMatrix(X_val, label=y_val, feature_names=FEATURE_NAMES) + + def objective(trial: optuna.Trial) -> float: + params = { + "objective": "reg:squarederror", + "eval_metric": "rmse", + "max_depth": trial.suggest_int("max_depth", 3, 10), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True), + "subsample": trial.suggest_float("subsample", 0.6, 1.0), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0), + "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), + "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True), + "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True), + "verbosity": 0, + } + n_rounds = trial.suggest_int("n_rounds", 100, 1000) + model = xgb.train( + params, dtrain, num_boost_round=n_rounds, + evals=[(dval, "val")], verbose_eval=False, + early_stopping_rounds=50, + ) + preds = model.predict(dval) + rmse = float(np.sqrt(np.mean((preds - y_val) ** 2))) + return rmse + + study = optuna.create_study(direction="minimize") + study.optimize(objective, n_trials=n_trials, show_progress_bar=False) + + # Retrain with best params on full train set + best = study.best_params + n_rounds = best.pop("n_rounds", 500) + best.update({"objective": "reg:squarederror", "eval_metric": "rmse", "verbosity": 0}) + model = xgb.train( + best, dtrain, num_boost_round=n_rounds, + evals=[(dval, "val")], verbose_eval=False, + early_stopping_rounds=50, + ) + logger.info("XGBoost trained — best RMSE: %.4f", study.best_value) + return best, model + + except Exception as e: + logger.warning("XGBoost training failed: %s", e) + return {}, None + + def _train_lightgbm( + self, + X_train: np.ndarray, y_train: np.ndarray, + X_val: np.ndarray, y_val: np.ndarray, + n_trials: int, + ) -> tuple[dict, Any]: + """Train LightGBM with Optuna hyperparameter optimization.""" + try: + import lightgbm as lgb + import optuna + + optuna.logging.set_verbosity(optuna.logging.WARNING) + + dtrain = lgb.Dataset(X_train, label=y_train, feature_name=FEATURE_NAMES) + dval = lgb.Dataset(X_val, label=y_val, feature_name=FEATURE_NAMES, reference=dtrain) + + def objective(trial: optuna.Trial) -> float: + params = { + "objective": "regression", + "metric": "rmse", + "num_leaves": trial.suggest_int("num_leaves", 15, 127), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True), + "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0), + "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0), + "bagging_freq": trial.suggest_int("bagging_freq", 1, 7), + "min_child_samples": trial.suggest_int("min_child_samples", 5, 50), + "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True), + "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True), + "verbosity": -1, + } + n_rounds = trial.suggest_int("n_rounds", 100, 1000) + callbacks = [lgb.early_stopping(50, verbose=False), lgb.log_evaluation(period=0)] + model = lgb.train( + params, dtrain, num_boost_round=n_rounds, + valid_sets=[dval], callbacks=callbacks, + ) + preds = model.predict(X_val) + rmse = float(np.sqrt(np.mean((preds - y_val) ** 2))) + return rmse + + study = optuna.create_study(direction="minimize") + study.optimize(objective, n_trials=n_trials, show_progress_bar=False) + + best = study.best_params + n_rounds = best.pop("n_rounds", 500) + best.update({"objective": "regression", "metric": "rmse", "verbosity": -1}) + callbacks = [lgb.early_stopping(50, verbose=False), lgb.log_evaluation(period=0)] + model = lgb.train( + best, dtrain, num_boost_round=n_rounds, + valid_sets=[dval], callbacks=callbacks, + ) + logger.info("LightGBM trained — best RMSE: %.4f", study.best_value) + return best, model + + except Exception as e: + logger.warning("LightGBM training failed: %s", e) + return {}, None + + def _train_catboost( + self, + X_train: np.ndarray, y_train: np.ndarray, + X_val: np.ndarray, y_val: np.ndarray, + n_trials: int, + ) -> tuple[dict, Any]: + """Train CatBoost with Optuna hyperparameter optimization.""" + try: + import optuna + from catboost import CatBoostRegressor, Pool + + optuna.logging.set_verbosity(optuna.logging.WARNING) + + train_pool = Pool(X_train, label=y_train, feature_names=FEATURE_NAMES) + val_pool = Pool(X_val, label=y_val, feature_names=FEATURE_NAMES) + + def objective(trial: optuna.Trial) -> float: + params = { + "iterations": trial.suggest_int("iterations", 100, 1000), + "depth": trial.suggest_int("depth", 4, 10), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True), + "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True), + "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0), + "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True), + "verbose": 0, + "loss_function": "RMSE", + "early_stopping_rounds": 50, + } + model = CatBoostRegressor(**params) + model.fit(train_pool, eval_set=val_pool, verbose=0) + preds = model.predict(val_pool) + rmse = float(np.sqrt(np.mean((preds - y_val) ** 2))) + return rmse + + study = optuna.create_study(direction="minimize") + study.optimize(objective, n_trials=n_trials, show_progress_bar=False) + + best = study.best_params + best.update({"verbose": 0, "loss_function": "RMSE", "early_stopping_rounds": 50}) + model = CatBoostRegressor(**best) + model.fit(train_pool, eval_set=val_pool, verbose=0) + logger.info("CatBoost trained — best RMSE: %.4f", study.best_value) + return best, model + + except Exception as e: + logger.warning("CatBoost training failed: %s", e) + return {}, None + + def _evaluate_ensemble( + self, models: dict[str, Any], X_test: np.ndarray, y_test: np.ndarray + ) -> dict: + """Evaluate ensemble performance on a test set.""" + if not models: + return {"mae": 0.0, "mape": 0.0, "rmse": 0.0, "r2": 0.0} + + predictions = [] + weights = [] + for name, model in models.items(): + w = ENSEMBLE_WEIGHTS.get(name, 0.0) + features = X_test + if name == "xgboost": + import xgboost as xgb + preds = model.predict(xgb.DMatrix(features, feature_names=FEATURE_NAMES)) + elif name == "lightgbm": + preds = model.predict(features) + elif name == "catboost": + preds = model.predict(features) + else: + continue + predictions.append(preds * w) + weights.append(w) + + total_weight = sum(weights) or 1.0 + ensemble_preds = sum(predictions) / total_weight + + # Metrics in log-space then convert + y_actual = np.exp(y_test) + y_pred = np.exp(ensemble_preds) + + mae = float(np.mean(np.abs(y_actual - y_pred))) + mape = float(np.mean(np.abs((y_actual - y_pred) / y_actual))) * 100 + rmse = float(np.sqrt(np.mean((y_actual - y_pred) ** 2))) + ss_res = np.sum((y_actual - y_pred) ** 2) + ss_tot = np.sum((y_actual - np.mean(y_actual)) ** 2) + r2 = float(1.0 - ss_res / ss_tot) if ss_tot > 0 else 0.0 + + return { + "mae": round(mae, 2), + "mape": round(mape, 2), + "rmse": round(rmse, 2), + "r2": round(r2, 4), + } + + def _save_model(self, name: str, model: Any, directory: Path) -> None: + """Save a trained model to the specified directory.""" + if name == "xgboost": + model.save_model(str(directory / "avm_v2_xgboost.json")) + elif name == "lightgbm": + model.save_model(str(directory / "avm_v2_lightgbm.txt")) + elif name == "catboost": + model.save_model(str(directory / "avm_v2_catboost.cbm")) + # ── Model registry ────────────────────────────────────────── + def _get_registry_path(self, model_dir: Path | None = None) -> Path: + """Get the path to the model registry JSON file.""" + if model_dir is None: + from app.config import settings + model_dir = Path(settings.model_path) + return model_dir / "model_registry.json" + + def _load_registry(self, model_dir: Path | None = None) -> list[dict]: + """Load the model registry from disk.""" + path = self._get_registry_path(model_dir) + if path.exists(): + with open(path) as f: + return json.load(f) + return [] + + def _save_registry(self, entries: list[dict], model_dir: Path | None = None) -> None: + """Save the model registry to disk.""" + path = self._get_registry_path(model_dir) + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + json.dump(entries, f, indent=2) + + def _register_model(self, info: AVMv2ModelInfo, model_dir: Path) -> None: + """Register a new model version and mark it as active.""" + entries = self._load_registry(model_dir) + # Deactivate previous active models + for entry in entries: + entry["is_active"] = False + entries.append({ + "model_version": info.model_version, + "created_at": info.created_at, + "metrics": info.metrics, + "is_active": True, + "ab_test_traffic_pct": info.ab_test_traffic_pct, + }) + self._save_registry(entries, model_dir) + self._model_registry = [ + AVMv2ModelInfo(**e) for e in entries + ] + def get_model_info(self) -> AVMv2ModelInfo: """Return current active model information.""" + # Check registry for active model + entries = self._load_registry() + for entry in reversed(entries): + if entry.get("is_active"): + return AVMv2ModelInfo(**entry) return AVMv2ModelInfo( model_version=self._model_version, created_at=datetime.now(timezone.utc).isoformat(), @@ -535,6 +1004,52 @@ class AVMv2EnsembleService: ab_test_traffic_pct=0.0, ) + def list_versions(self) -> list[AVMv2ModelInfo]: + """List all registered model versions.""" + entries = self._load_registry() + return [AVMv2ModelInfo(**e) for e in entries] + + def rollback(self, target_version: str) -> AVMv2ModelInfo: + """Rollback to a previously trained model version. + + Copies the target version's artifacts to the active model directory + and updates the registry. + """ + from app.config import settings + + model_dir = Path(settings.model_path) + version_dir = model_dir / "versions" / target_version + + if not version_dir.exists(): + raise ValueError(f"Model version {target_version} not found") + + # Copy versioned artifacts to active directory + for artifact in version_dir.iterdir(): + if artifact.is_file(): + shutil.copy2(artifact, model_dir / artifact.name) + + # Update registry + entries = self._load_registry(model_dir) + found = False + for entry in entries: + entry["is_active"] = entry["model_version"] == target_version + if entry["model_version"] == target_version: + found = True + + if not found: + raise ValueError(f"Model version {target_version} not in registry") + + self._save_registry(entries, model_dir) + + # Reload models from disk + self._models = {} + self._load_models() + self._model_version = target_version + + logger.info("Rolled back to model version %s", target_version) + active = next(e for e in entries if e["is_active"]) + return AVMv2ModelInfo(**active) + # ── A/B comparison ───────────────────────────────────────── def compare_v1(self, req: ABComparisonRequest) -> ABComparisonResponse: @@ -562,13 +1077,18 @@ class AVMv2EnsembleService: area_m2=req.area_m2, rooms=req.rooms or req.bedrooms, has_legal_paper=req.has_legal_paper, + neighborhood_score=req.neighborhood_score, distance_to_cbd_km=req.distance_to_cbd_km, distance_to_metro_km=req.distance_to_metro_km, flood_zone_risk=req.flood_zone_risk, building_age_years=req.building_age_years, + floor_level=req.floor_level, + total_floors=req.total_floors, + direction=req.direction, has_elevator=req.has_elevator, has_parking=req.has_parking, has_pool=req.has_pool, + developer_reputation=req.developer_reputation, renovation_score=req.renovation_score, view_quality=req.view_quality, interior_quality=req.interior_quality, diff --git a/libs/ai-services/tests/test_avm_v2.py b/libs/ai-services/tests/test_avm_v2.py index 5bc5acf..d979947 100644 --- a/libs/ai-services/tests/test_avm_v2.py +++ b/libs/ai-services/tests/test_avm_v2.py @@ -65,9 +65,10 @@ def test_predict_v2_returns_drivers(): def test_predict_v2_with_full_features(): - """Predict with all features populated.""" + """Predict with all features populated (including new v2 features).""" payload = { **_PREDICT_PAYLOAD, + "neighborhood_score": 0.8, "distance_to_cbd_km": 5.0, "distance_to_metro_km": 0.8, "distance_to_school_km": 0.5, @@ -75,11 +76,15 @@ def test_predict_v2_with_full_features(): "distance_to_park_km": 0.3, "distance_to_mall_km": 1.0, "flood_zone_risk": 0.1, + "floor_level": 12, + "total_floors": 25, + "direction": "southeast", "floor_ratio": 1.2, "building_age_years": 5, "has_elevator": True, "has_parking": True, "has_pool": False, + "developer_reputation": 0.9, "avg_price_district_3m_vnd_m2": 85_000_000, "listing_density": 12.5, "absorption_rate": 0.3, @@ -149,8 +154,93 @@ def test_predict_v2_invalid_area(): assert resp.status_code == 422 -def test_train_v2_scaffold(): - """Training endpoint should return scaffold response.""" +# ── New v2 features: neighborhood, floor, direction, developer ── + + +def test_predict_v2_neighborhood_premium(): + """High neighborhood score should increase price.""" + low_nb = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "neighborhood_score": 0.2}, + ).json() + high_nb = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "neighborhood_score": 0.9}, + ).json() + + assert high_nb["estimated_price_vnd"] > low_nb["estimated_price_vnd"] + + +def test_predict_v2_floor_level_premium(): + """Higher floor apartments should command a premium.""" + ground = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "floor_level": 2, "total_floors": 25}, + ).json() + high = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "floor_level": 20, "total_floors": 25}, + ).json() + + assert high["estimated_price_vnd"] > ground["estimated_price_vnd"] + + +def test_predict_v2_direction_premium(): + """South-facing properties should be priced higher than north-facing.""" + south = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "direction": "south"}, + ).json() + north = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "direction": "north"}, + ).json() + + assert south["estimated_price_vnd"] > north["estimated_price_vnd"] + + +def test_predict_v2_developer_reputation(): + """Properties from reputable developers should be valued higher.""" + low_rep = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "developer_reputation": 0.2}, + ).json() + high_rep = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "developer_reputation": 0.9}, + ).json() + + assert high_rep["estimated_price_vnd"] > low_rep["estimated_price_vnd"] + + +def test_predict_v2_direction_defaults_unknown(): + """Unknown direction should not affect price (neutral).""" + explicit = client.post( + "/avm/v2/predict", + json={**_PREDICT_PAYLOAD, "direction": "unknown"}, + ).json() + default = client.post("/avm/v2/predict", json=_PREDICT_PAYLOAD).json() + + assert explicit["estimated_price_vnd"] == default["estimated_price_vnd"] + + +def test_predict_v2_drivers_include_new_features(): + """Drivers should include neighborhood_score, direction, floor_level.""" + resp = client.post("/avm/v2/predict", json=_PREDICT_PAYLOAD) + data = resp.json() + driver_names = {d["feature"] for d in data["drivers"]} + + assert "neighborhood_score" in driver_names + assert "direction_encoded" in driver_names + assert "floor_level" in driver_names + assert "developer_reputation" in driver_names + + +# ── Training & model info ─────────────────────────────────────── + + +def test_train_v2_no_data(): + """Training without data returns scaffold with zero metrics.""" resp = client.post( "/avm/v2/train", json={"optuna_trials": 10}, @@ -159,10 +249,7 @@ def test_train_v2_scaffold(): data = resp.json() assert "model_version" in data assert "ensemble-v2-" in data["model_version"] - assert data["metrics"]["mae"] == 0.0 # scaffold returns zeros - assert "xgboost" in data["best_params"] - assert "lightgbm" in data["best_params"] - assert "catboost" in data["best_params"] + assert data["training_samples"] == 0 def test_model_info_v2(): @@ -174,6 +261,26 @@ def test_model_info_v2(): assert data["is_active"] is True +# ── Model versioning ──────────────────────────────────────────── + + +def test_list_versions(): + """Versions endpoint returns a list.""" + resp = client.get("/avm/v2/versions") + assert resp.status_code == 200 + data = resp.json() + assert isinstance(data, list) + + +def test_rollback_not_found(): + """Rollback to non-existent version returns 404.""" + resp = client.post( + "/avm/v2/rollback", + json={"target_version": "nonexistent-version-xyz"}, + ) + assert resp.status_code == 404 + + # ── A/B comparison tests ───────────────────────────────────── _COMPARE_PAYLOAD = { @@ -227,12 +334,17 @@ def test_compare_v1_with_v2_features(): """Compare endpoint passes v2-specific features correctly.""" payload = { **_COMPARE_PAYLOAD, + "neighborhood_score": 0.8, "distance_to_cbd_km": 5.0, "distance_to_metro_km": 0.8, "flood_zone_risk": 0.1, "building_age_years": 3, + "floor_level": 15, + "total_floors": 30, + "direction": "southeast", "has_elevator": True, "has_parking": True, + "developer_reputation": 0.85, "renovation_score": 0.9, "view_quality": 0.8, "interior_quality": 0.85,