feat(ai-services): AVM v2 residential — expanded features, training pipeline, model versioning
Add neighborhood_score, developer_reputation, floor_level, direction premiums to the multi-model ensemble. Implement real Optuna-based training pipeline for XGBoost/LightGBM/CatBoost with grouped train/val/test splits. Add file-based model registry with rollback and list-versions endpoints. 23 Python tests covering all new features. Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -29,10 +29,28 @@ class AVMv2PredictRequest(BaseModel):
|
||||
0.0, ge=0, le=1, description="Flood zone risk score (0=safe, 1=high risk)"
|
||||
)
|
||||
|
||||
# ── Neighborhood features ─────────────────────────────
|
||||
neighborhood_score: float = Field(
|
||||
0.5, ge=0, le=1,
|
||||
description="Overall neighborhood quality score (0-1, aggregated from safety, amenities, walkability)",
|
||||
)
|
||||
|
||||
# ── Physical features ──────────────────────────────────
|
||||
property_type: str = Field(..., description="e.g. apartment, house, villa, land")
|
||||
area_m2: float = Field(..., gt=0, description="Property area in m²")
|
||||
rooms: int = Field(0, ge=0, description="Total rooms (bedrooms)")
|
||||
floor_level: int = Field(
|
||||
0, ge=0,
|
||||
description="Floor level (0=ground or N/A, relevant for apartments/penthouses)",
|
||||
)
|
||||
total_floors: int = Field(
|
||||
0, ge=0,
|
||||
description="Total floors in the building (0=N/A)",
|
||||
)
|
||||
direction: str = Field(
|
||||
"unknown",
|
||||
description="Facing direction: north, south, east, west, northeast, northwest, southeast, southwest, unknown",
|
||||
)
|
||||
floor_ratio: float = Field(
|
||||
1.0, gt=0, description="Total floor area / land area ratio"
|
||||
)
|
||||
@@ -41,6 +59,10 @@ class AVMv2PredictRequest(BaseModel):
|
||||
has_parking: bool = Field(False, description="Property has dedicated parking")
|
||||
has_pool: bool = Field(False, description="Property has swimming pool")
|
||||
has_legal_paper: bool = Field(True, description="Has sổ đỏ/sổ hồng")
|
||||
developer_reputation: float = Field(
|
||||
0.5, ge=0, le=1,
|
||||
description="Project developer reputation score (0-1, based on past projects, delivery record)",
|
||||
)
|
||||
|
||||
# ── Market features ────────────────────────────────────
|
||||
avg_price_district_3m_vnd_m2: float = Field(
|
||||
@@ -185,6 +207,12 @@ class AVMv2ModelInfo(BaseModel):
|
||||
ab_test_traffic_pct: float = Field(0.0, ge=0, le=1)
|
||||
|
||||
|
||||
class AVMv2RollbackRequest(BaseModel):
|
||||
"""Request to rollback to a specific model version."""
|
||||
|
||||
target_version: str = Field(..., min_length=1, description="Model version to roll back to")
|
||||
|
||||
|
||||
class AVMv1Summary(BaseModel):
|
||||
"""Compact summary of a v1 prediction for comparison."""
|
||||
|
||||
@@ -220,13 +248,18 @@ class ABComparisonRequest(BaseModel):
|
||||
frontage: float = Field(0.0, ge=0)
|
||||
has_legal_paper: bool = Field(True)
|
||||
# v2-specific features (optional, defaults applied)
|
||||
neighborhood_score: float = Field(0.5, ge=0, le=1)
|
||||
distance_to_cbd_km: float = Field(0.0, ge=0)
|
||||
distance_to_metro_km: float = Field(0.0, ge=0)
|
||||
flood_zone_risk: float = Field(0.0, ge=0, le=1)
|
||||
building_age_years: int = Field(0, ge=0)
|
||||
floor_level: int = Field(0, ge=0)
|
||||
total_floors: int = Field(0, ge=0)
|
||||
direction: str = Field("unknown")
|
||||
has_elevator: bool = Field(False)
|
||||
has_parking: bool = Field(False)
|
||||
has_pool: bool = Field(False)
|
||||
developer_reputation: float = Field(0.5, ge=0, le=1)
|
||||
renovation_score: float = Field(0.5, ge=0, le=1)
|
||||
view_quality: float = Field(0.5, ge=0, le=1)
|
||||
interior_quality: float = Field(0.5, ge=0, le=1)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""AVM v2 ensemble router — residential property valuation."""
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from app.models.avm_v2 import (
|
||||
ABComparisonRequest,
|
||||
@@ -8,6 +8,7 @@ from app.models.avm_v2 import (
|
||||
AVMv2ModelInfo,
|
||||
AVMv2PredictRequest,
|
||||
AVMv2PredictResponse,
|
||||
AVMv2RollbackRequest,
|
||||
AVMv2TrainRequest,
|
||||
AVMv2TrainResponse,
|
||||
)
|
||||
@@ -30,7 +31,9 @@ def predict_v2(req: AVMv2PredictRequest) -> AVMv2PredictResponse:
|
||||
def train_v2(req: AVMv2TrainRequest) -> AVMv2TrainResponse:
|
||||
"""Trigger model retraining with Optuna hyperparameter optimization.
|
||||
|
||||
Requires training data pipeline (Phase 3). Currently returns scaffold.
|
||||
Loads training data from the model directory, runs Optuna for each
|
||||
model in the ensemble, saves versioned artifacts, and registers
|
||||
the new version in the model registry.
|
||||
"""
|
||||
return avm_v2_service.train(req)
|
||||
|
||||
@@ -49,3 +52,22 @@ def compare_v1(req: ABComparisonRequest) -> ABComparisonResponse:
|
||||
def model_info_v2() -> AVMv2ModelInfo:
|
||||
"""Get current active ensemble model information."""
|
||||
return avm_v2_service.get_model_info()
|
||||
|
||||
|
||||
@router.get("/versions", response_model=list[AVMv2ModelInfo])
|
||||
def list_versions() -> list[AVMv2ModelInfo]:
|
||||
"""List all registered model versions with their metrics and status."""
|
||||
return avm_v2_service.list_versions()
|
||||
|
||||
|
||||
@router.post("/rollback", response_model=AVMv2ModelInfo)
|
||||
def rollback(req: AVMv2RollbackRequest) -> AVMv2ModelInfo:
|
||||
"""Rollback to a previously trained model version.
|
||||
|
||||
Copies the target version's artifacts to the active model directory,
|
||||
reloads models, and updates the registry.
|
||||
"""
|
||||
try:
|
||||
return avm_v2_service.rollback(req.target_version)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=404, detail=str(e))
|
||||
|
||||
@@ -5,9 +5,12 @@ Ensemble weights: XGBoost 0.4, LightGBM 0.35, CatBoost 0.25.
|
||||
Confidence = 1 - CV(3 predictions), where CV = std / mean.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -47,16 +50,22 @@ FEATURE_NAMES = [
|
||||
"distance_to_park_km",
|
||||
"distance_to_mall_km",
|
||||
"flood_zone_risk",
|
||||
# Physical (8)
|
||||
# Neighborhood (1)
|
||||
"neighborhood_score",
|
||||
# Physical (13)
|
||||
"property_type_encoded",
|
||||
"area_m2",
|
||||
"rooms",
|
||||
"floor_level",
|
||||
"total_floors",
|
||||
"direction_encoded",
|
||||
"floor_ratio",
|
||||
"building_age_years",
|
||||
"has_elevator",
|
||||
"has_parking",
|
||||
"has_pool",
|
||||
"has_legal_paper",
|
||||
"developer_reputation",
|
||||
# Market (6)
|
||||
"avg_price_district_3m_vnd_m2",
|
||||
"listing_density",
|
||||
@@ -76,6 +85,18 @@ FEATURE_NAMES = [
|
||||
"is_year_end",
|
||||
]
|
||||
|
||||
DIRECTION_MAP = {
|
||||
"south": 0,
|
||||
"southeast": 1,
|
||||
"east": 2,
|
||||
"southwest": 3,
|
||||
"northeast": 4,
|
||||
"west": 5,
|
||||
"northwest": 6,
|
||||
"north": 7,
|
||||
"unknown": 4, # neutral mid-value
|
||||
}
|
||||
|
||||
PROPERTY_TYPE_MAP = {
|
||||
"apartment": 0,
|
||||
"house": 1,
|
||||
@@ -106,7 +127,7 @@ def _encode_features(req: AVMv2PredictRequest) -> np.ndarray:
|
||||
month_rad = 2 * np.pi * req.month / 12.0
|
||||
return np.array(
|
||||
[[
|
||||
# Location
|
||||
# Location (7)
|
||||
req.distance_to_cbd_km,
|
||||
req.distance_to_metro_km,
|
||||
req.distance_to_school_km,
|
||||
@@ -114,30 +135,36 @@ def _encode_features(req: AVMv2PredictRequest) -> np.ndarray:
|
||||
req.distance_to_park_km,
|
||||
req.distance_to_mall_km,
|
||||
req.flood_zone_risk,
|
||||
# Physical
|
||||
# Neighborhood (1)
|
||||
req.neighborhood_score,
|
||||
# Physical (13)
|
||||
PROPERTY_TYPE_MAP.get(req.property_type.lower(), 1),
|
||||
req.area_m2,
|
||||
req.rooms,
|
||||
float(req.floor_level),
|
||||
float(req.total_floors),
|
||||
float(DIRECTION_MAP.get(req.direction.lower(), 4)),
|
||||
req.floor_ratio,
|
||||
req.building_age_years,
|
||||
1.0 if req.has_elevator else 0.0,
|
||||
1.0 if req.has_parking else 0.0,
|
||||
1.0 if req.has_pool else 0.0,
|
||||
1.0 if req.has_legal_paper else 0.0,
|
||||
# Market
|
||||
req.developer_reputation,
|
||||
# Market (6)
|
||||
req.avg_price_district_3m_vnd_m2,
|
||||
req.listing_density,
|
||||
req.absorption_rate,
|
||||
req.dom_avg,
|
||||
req.price_momentum_30d,
|
||||
req.yoy_change,
|
||||
# LLM-extracted
|
||||
# LLM-extracted (5)
|
||||
req.renovation_score,
|
||||
req.view_quality,
|
||||
req.interior_quality,
|
||||
req.noise_level,
|
||||
req.natural_light,
|
||||
# Temporal
|
||||
# Temporal (3)
|
||||
np.sin(month_rad),
|
||||
np.cos(month_rad),
|
||||
1.0 if req.is_year_end else 0.0,
|
||||
@@ -319,6 +346,9 @@ class AVMv2EnsembleService:
|
||||
metro_adj = 1.0 + max(0.0, (2.0 - req.distance_to_metro_km) * 0.05)
|
||||
flood_adj = 1.0 - req.flood_zone_risk * 0.15
|
||||
|
||||
# Neighborhood adjustment: ±15% swing around 0.5 midpoint
|
||||
neighborhood_adj = 1.0 + (req.neighborhood_score - 0.5) * 0.30
|
||||
|
||||
# Physical adjustments
|
||||
room_adj = 1.0 + req.rooms * 0.015
|
||||
age_adj = max(0.75, 1.0 - req.building_age_years * 0.008)
|
||||
@@ -330,6 +360,34 @@ class AVMv2EnsembleService:
|
||||
)
|
||||
legal_adj = 1.0 if req.has_legal_paper else 0.70
|
||||
|
||||
# Floor level premium (apartments/penthouses: higher floors = premium)
|
||||
floor_adj = 1.0
|
||||
if req.floor_level > 0 and req.property_type.lower() in ("apartment", "penthouse"):
|
||||
if req.total_floors > 0:
|
||||
relative_floor = req.floor_level / req.total_floors
|
||||
# Mid-to-high floors get up to +8% premium, ground floor -3%
|
||||
floor_adj = 1.0 + (relative_floor - 0.3) * 0.12
|
||||
floor_adj = max(0.97, min(1.08, floor_adj))
|
||||
else:
|
||||
# No total_floors info: mild premium for higher floors
|
||||
floor_adj = min(1.08, 1.0 + req.floor_level * 0.003)
|
||||
|
||||
# Direction premium (Vietnamese preference: south/southeast best)
|
||||
direction_adj = {
|
||||
"south": 1.05,
|
||||
"southeast": 1.04,
|
||||
"east": 1.02,
|
||||
"southwest": 1.01,
|
||||
"northeast": 1.0,
|
||||
"west": 0.98,
|
||||
"northwest": 0.97,
|
||||
"north": 0.96,
|
||||
"unknown": 1.0,
|
||||
}.get(req.direction.lower(), 1.0)
|
||||
|
||||
# Developer reputation: ±10% swing
|
||||
developer_adj = 1.0 + (req.developer_reputation - 0.5) * 0.20
|
||||
|
||||
# Market adjustments
|
||||
if req.avg_price_district_3m_vnd_m2 > 0:
|
||||
market_adj = req.avg_price_district_3m_vnd_m2 / (base * 1_000_000)
|
||||
@@ -357,10 +415,14 @@ class AVMv2EnsembleService:
|
||||
* cbd_adj
|
||||
* metro_adj
|
||||
* flood_adj
|
||||
* neighborhood_adj
|
||||
* room_adj
|
||||
* age_adj
|
||||
* amenity_adj
|
||||
* legal_adj
|
||||
* floor_adj
|
||||
* direction_adj
|
||||
* developer_adj
|
||||
* market_adj
|
||||
* momentum_adj
|
||||
* quality_adj
|
||||
@@ -407,16 +469,20 @@ class AVMv2EnsembleService:
|
||||
|
||||
# Heuristic driver ranking
|
||||
drivers = [
|
||||
AVMv2FeatureImportance(feature="area_m2", importance=0.18),
|
||||
AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.15),
|
||||
AVMv2FeatureImportance(feature="property_type_encoded", importance=0.12),
|
||||
AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.10),
|
||||
AVMv2FeatureImportance(feature="renovation_score", importance=0.08),
|
||||
AVMv2FeatureImportance(feature="building_age_years", importance=0.07),
|
||||
AVMv2FeatureImportance(feature="has_legal_paper", importance=0.06),
|
||||
AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.05),
|
||||
AVMv2FeatureImportance(feature="interior_quality", importance=0.05),
|
||||
AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.04),
|
||||
AVMv2FeatureImportance(feature="area_m2", importance=0.14),
|
||||
AVMv2FeatureImportance(feature="avg_price_district_3m_vnd_m2", importance=0.12),
|
||||
AVMv2FeatureImportance(feature="neighborhood_score", importance=0.10),
|
||||
AVMv2FeatureImportance(feature="property_type_encoded", importance=0.10),
|
||||
AVMv2FeatureImportance(feature="distance_to_cbd_km", importance=0.08),
|
||||
AVMv2FeatureImportance(feature="developer_reputation", importance=0.07),
|
||||
AVMv2FeatureImportance(feature="renovation_score", importance=0.07),
|
||||
AVMv2FeatureImportance(feature="building_age_years", importance=0.06),
|
||||
AVMv2FeatureImportance(feature="direction_encoded", importance=0.05),
|
||||
AVMv2FeatureImportance(feature="floor_level", importance=0.05),
|
||||
AVMv2FeatureImportance(feature="has_legal_paper", importance=0.05),
|
||||
AVMv2FeatureImportance(feature="distance_to_metro_km", importance=0.04),
|
||||
AVMv2FeatureImportance(feature="interior_quality", importance=0.04),
|
||||
AVMv2FeatureImportance(feature="price_momentum_30d", importance=0.03),
|
||||
]
|
||||
|
||||
return AVMv2PredictResponse(
|
||||
@@ -481,52 +547,455 @@ class AVMv2EnsembleService:
|
||||
# ── Training pipeline ───────────────────────────────────────
|
||||
|
||||
def train(self, req: AVMv2TrainRequest) -> AVMv2TrainResponse:
|
||||
"""Train the ensemble models.
|
||||
"""Train the ensemble models on available data.
|
||||
|
||||
In production, this loads training data from the database/MinIO,
|
||||
performs 5-fold CV by district with Optuna hyperparameter optimization,
|
||||
and saves versioned model artifacts.
|
||||
|
||||
Currently returns a scaffold response. Real training requires
|
||||
the data pipeline from Phase 3.
|
||||
Pipeline:
|
||||
1. Load training data from CSV/database export
|
||||
2. Feature engineering (encode, normalize, cyclical)
|
||||
3. Train/val/test split stratified by district
|
||||
4. For each model: Optuna hyperparameter optimization
|
||||
5. Save versioned artifacts + register in model registry
|
||||
"""
|
||||
from app.config import settings
|
||||
|
||||
version = f"ensemble-v2-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}"
|
||||
logger.info("Training AVM v2 ensemble — version %s, trials=%d", version, req.optuna_trials)
|
||||
|
||||
# TODO: Replace with actual training pipeline when data is available
|
||||
# 1. Load data from PostgreSQL/MinIO
|
||||
# 2. Feature engineering (encode categoricals, normalize, cyclical)
|
||||
# 3. 80/10/10 split stratified by district
|
||||
# 4. For each model (XGBoost, LightGBM, CatBoost):
|
||||
# a. Optuna study with req.optuna_trials trials
|
||||
# b. 5-fold CV grouped by district
|
||||
# c. Train on best params
|
||||
# 5. Save artifacts to MinIO with version tag
|
||||
# 6. Register in model registry
|
||||
model_dir = Path(settings.model_path)
|
||||
data_path = model_dir / "training_data.csv"
|
||||
|
||||
# Check for training data
|
||||
if not data_path.exists():
|
||||
logger.warning("No training data found at %s — returning scaffold", data_path)
|
||||
return AVMv2TrainResponse(
|
||||
model_version=version,
|
||||
metrics={"mae": 0.0, "mape": 0.0, "rmse": 0.0, "r2": 0.0},
|
||||
district_metrics={},
|
||||
training_samples=0,
|
||||
validation_samples=0,
|
||||
test_samples=0,
|
||||
best_params={},
|
||||
)
|
||||
|
||||
# Load and prepare data
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import GroupShuffleSplit
|
||||
|
||||
df = pd.read_csv(data_path)
|
||||
logger.info("Loaded %d training samples", len(df))
|
||||
|
||||
# Feature engineering
|
||||
X, y, groups = self._prepare_training_data(df)
|
||||
if len(X) < 50:
|
||||
logger.warning("Insufficient training data (%d samples)", len(X))
|
||||
return AVMv2TrainResponse(
|
||||
model_version=version,
|
||||
metrics={"mae": 0.0, "mape": 0.0, "rmse": 0.0, "r2": 0.0},
|
||||
district_metrics={},
|
||||
training_samples=len(X),
|
||||
validation_samples=0,
|
||||
test_samples=0,
|
||||
best_params={},
|
||||
)
|
||||
|
||||
# Split: train/val/test grouped by district
|
||||
gss_test = GroupShuffleSplit(n_splits=1, test_size=req.test_size, random_state=42)
|
||||
train_val_idx, test_idx = next(gss_test.split(X, y, groups))
|
||||
X_trainval, y_trainval = X[train_val_idx], y[train_val_idx]
|
||||
X_test, y_test = X[test_idx], y[test_idx]
|
||||
groups_trainval = groups[train_val_idx]
|
||||
|
||||
val_ratio = req.val_size / (1.0 - req.test_size)
|
||||
gss_val = GroupShuffleSplit(n_splits=1, test_size=val_ratio, random_state=42)
|
||||
train_idx, val_idx = next(gss_val.split(X_trainval, y_trainval, groups_trainval))
|
||||
X_train, y_train = X_trainval[train_idx], y_trainval[train_idx]
|
||||
X_val, y_val = X_trainval[val_idx], y_trainval[val_idx]
|
||||
|
||||
logger.info("Split: train=%d, val=%d, test=%d", len(X_train), len(X_val), len(X_test))
|
||||
|
||||
# Train each model with Optuna
|
||||
best_params: dict[str, dict] = {}
|
||||
trained_models: dict[str, Any] = {}
|
||||
|
||||
xgb_params, xgb_model = self._train_xgboost(X_train, y_train, X_val, y_val, req.optuna_trials)
|
||||
if xgb_model:
|
||||
best_params["xgboost"] = xgb_params
|
||||
trained_models["xgboost"] = xgb_model
|
||||
|
||||
lgb_params, lgb_model = self._train_lightgbm(X_train, y_train, X_val, y_val, req.optuna_trials)
|
||||
if lgb_model:
|
||||
best_params["lightgbm"] = lgb_params
|
||||
trained_models["lightgbm"] = lgb_model
|
||||
|
||||
cat_params, cat_model = self._train_catboost(X_train, y_train, X_val, y_val, req.optuna_trials)
|
||||
if cat_model:
|
||||
best_params["catboost"] = cat_params
|
||||
trained_models["catboost"] = cat_model
|
||||
|
||||
# Evaluate ensemble on test set
|
||||
metrics = self._evaluate_ensemble(trained_models, X_test, y_test)
|
||||
|
||||
# Save versioned artifacts
|
||||
version_dir = model_dir / "versions" / version
|
||||
version_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for name, model in trained_models.items():
|
||||
self._save_model(name, model, version_dir)
|
||||
# Also save to active model directory
|
||||
self._save_model(name, model, model_dir)
|
||||
|
||||
# Register in model registry
|
||||
registry_entry = AVMv2ModelInfo(
|
||||
model_version=version,
|
||||
created_at=datetime.now(timezone.utc).isoformat(),
|
||||
metrics=metrics,
|
||||
is_active=True,
|
||||
ab_test_traffic_pct=0.0,
|
||||
)
|
||||
self._register_model(registry_entry, model_dir)
|
||||
|
||||
# Reload models
|
||||
self._models = trained_models
|
||||
self._model_version = version
|
||||
|
||||
return AVMv2TrainResponse(
|
||||
model_version=version,
|
||||
metrics={
|
||||
"mae": 0.0,
|
||||
"mape": 0.0,
|
||||
"rmse": 0.0,
|
||||
"r2": 0.0,
|
||||
},
|
||||
metrics=metrics,
|
||||
district_metrics={},
|
||||
training_samples=0,
|
||||
validation_samples=0,
|
||||
test_samples=0,
|
||||
best_params={
|
||||
"xgboost": {"n_estimators": 500, "max_depth": 6, "learning_rate": 0.05},
|
||||
"lightgbm": {"n_estimators": 500, "num_leaves": 31, "learning_rate": 0.05},
|
||||
"catboost": {"iterations": 500, "depth": 6, "learning_rate": 0.05},
|
||||
},
|
||||
training_samples=len(X_train),
|
||||
validation_samples=len(X_val),
|
||||
test_samples=len(X_test),
|
||||
best_params=best_params,
|
||||
)
|
||||
|
||||
def _prepare_training_data(
|
||||
self, df: "pd.DataFrame"
|
||||
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
|
||||
"""Encode a DataFrame into feature matrix, target vector, and group labels."""
|
||||
import pandas as pd # noqa: F811
|
||||
|
||||
feature_cols = [
|
||||
"distance_to_cbd_km", "distance_to_metro_km", "distance_to_school_km",
|
||||
"distance_to_hospital_km", "distance_to_park_km", "distance_to_mall_km",
|
||||
"flood_zone_risk", "neighborhood_score",
|
||||
"property_type", "area_m2", "rooms", "floor_level", "total_floors",
|
||||
"direction", "floor_ratio", "building_age_years",
|
||||
"has_elevator", "has_parking", "has_pool", "has_legal_paper",
|
||||
"developer_reputation",
|
||||
"avg_price_district_3m_vnd_m2", "listing_density", "absorption_rate",
|
||||
"dom_avg", "price_momentum_30d", "yoy_change",
|
||||
"renovation_score", "view_quality", "interior_quality",
|
||||
"noise_level", "natural_light",
|
||||
"month",
|
||||
]
|
||||
|
||||
# Fill missing columns with defaults
|
||||
for col in feature_cols:
|
||||
if col not in df.columns:
|
||||
df[col] = 0.0 if col not in ("property_type", "direction") else "unknown"
|
||||
|
||||
# Encode categoricals
|
||||
df["property_type_encoded"] = df["property_type"].str.lower().map(PROPERTY_TYPE_MAP).fillna(1)
|
||||
df["direction_encoded"] = df["direction"].str.lower().map(DIRECTION_MAP).fillna(4)
|
||||
|
||||
# Cyclical month encoding
|
||||
month_rad = 2 * np.pi * df["month"].astype(float) / 12.0
|
||||
df["month_sin"] = np.sin(month_rad)
|
||||
df["month_cos"] = np.cos(month_rad)
|
||||
df["is_year_end_encoded"] = (df["month"].astype(int).isin([10, 11, 12])).astype(float)
|
||||
|
||||
# Boolean encoding
|
||||
for col in ["has_elevator", "has_parking", "has_pool", "has_legal_paper"]:
|
||||
df[col] = df[col].astype(float)
|
||||
|
||||
encoded_feature_cols = [
|
||||
"distance_to_cbd_km", "distance_to_metro_km", "distance_to_school_km",
|
||||
"distance_to_hospital_km", "distance_to_park_km", "distance_to_mall_km",
|
||||
"flood_zone_risk", "neighborhood_score",
|
||||
"property_type_encoded", "area_m2", "rooms", "floor_level", "total_floors",
|
||||
"direction_encoded", "floor_ratio", "building_age_years",
|
||||
"has_elevator", "has_parking", "has_pool", "has_legal_paper",
|
||||
"developer_reputation",
|
||||
"avg_price_district_3m_vnd_m2", "listing_density", "absorption_rate",
|
||||
"dom_avg", "price_momentum_30d", "yoy_change",
|
||||
"renovation_score", "view_quality", "interior_quality",
|
||||
"noise_level", "natural_light",
|
||||
"month_sin", "month_cos", "is_year_end_encoded",
|
||||
]
|
||||
|
||||
X = df[encoded_feature_cols].values.astype(np.float64)
|
||||
y = np.log(df["price_vnd"].values.astype(np.float64)) # Log-price target
|
||||
groups = df.get("district", pd.Series(["default"] * len(df))).values
|
||||
|
||||
return X, y, groups
|
||||
|
||||
def _train_xgboost(
|
||||
self,
|
||||
X_train: np.ndarray, y_train: np.ndarray,
|
||||
X_val: np.ndarray, y_val: np.ndarray,
|
||||
n_trials: int,
|
||||
) -> tuple[dict, Any]:
|
||||
"""Train XGBoost with Optuna hyperparameter optimization."""
|
||||
try:
|
||||
import optuna
|
||||
import xgboost as xgb
|
||||
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
|
||||
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=FEATURE_NAMES)
|
||||
dval = xgb.DMatrix(X_val, label=y_val, feature_names=FEATURE_NAMES)
|
||||
|
||||
def objective(trial: optuna.Trial) -> float:
|
||||
params = {
|
||||
"objective": "reg:squarederror",
|
||||
"eval_metric": "rmse",
|
||||
"max_depth": trial.suggest_int("max_depth", 3, 10),
|
||||
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
|
||||
"subsample": trial.suggest_float("subsample", 0.6, 1.0),
|
||||
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
|
||||
"min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
|
||||
"reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
|
||||
"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
|
||||
"verbosity": 0,
|
||||
}
|
||||
n_rounds = trial.suggest_int("n_rounds", 100, 1000)
|
||||
model = xgb.train(
|
||||
params, dtrain, num_boost_round=n_rounds,
|
||||
evals=[(dval, "val")], verbose_eval=False,
|
||||
early_stopping_rounds=50,
|
||||
)
|
||||
preds = model.predict(dval)
|
||||
rmse = float(np.sqrt(np.mean((preds - y_val) ** 2)))
|
||||
return rmse
|
||||
|
||||
study = optuna.create_study(direction="minimize")
|
||||
study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
|
||||
|
||||
# Retrain with best params on full train set
|
||||
best = study.best_params
|
||||
n_rounds = best.pop("n_rounds", 500)
|
||||
best.update({"objective": "reg:squarederror", "eval_metric": "rmse", "verbosity": 0})
|
||||
model = xgb.train(
|
||||
best, dtrain, num_boost_round=n_rounds,
|
||||
evals=[(dval, "val")], verbose_eval=False,
|
||||
early_stopping_rounds=50,
|
||||
)
|
||||
logger.info("XGBoost trained — best RMSE: %.4f", study.best_value)
|
||||
return best, model
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("XGBoost training failed: %s", e)
|
||||
return {}, None
|
||||
|
||||
def _train_lightgbm(
|
||||
self,
|
||||
X_train: np.ndarray, y_train: np.ndarray,
|
||||
X_val: np.ndarray, y_val: np.ndarray,
|
||||
n_trials: int,
|
||||
) -> tuple[dict, Any]:
|
||||
"""Train LightGBM with Optuna hyperparameter optimization."""
|
||||
try:
|
||||
import lightgbm as lgb
|
||||
import optuna
|
||||
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
|
||||
dtrain = lgb.Dataset(X_train, label=y_train, feature_name=FEATURE_NAMES)
|
||||
dval = lgb.Dataset(X_val, label=y_val, feature_name=FEATURE_NAMES, reference=dtrain)
|
||||
|
||||
def objective(trial: optuna.Trial) -> float:
|
||||
params = {
|
||||
"objective": "regression",
|
||||
"metric": "rmse",
|
||||
"num_leaves": trial.suggest_int("num_leaves", 15, 127),
|
||||
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
|
||||
"feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
|
||||
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
|
||||
"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
|
||||
"min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
|
||||
"reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
|
||||
"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
|
||||
"verbosity": -1,
|
||||
}
|
||||
n_rounds = trial.suggest_int("n_rounds", 100, 1000)
|
||||
callbacks = [lgb.early_stopping(50, verbose=False), lgb.log_evaluation(period=0)]
|
||||
model = lgb.train(
|
||||
params, dtrain, num_boost_round=n_rounds,
|
||||
valid_sets=[dval], callbacks=callbacks,
|
||||
)
|
||||
preds = model.predict(X_val)
|
||||
rmse = float(np.sqrt(np.mean((preds - y_val) ** 2)))
|
||||
return rmse
|
||||
|
||||
study = optuna.create_study(direction="minimize")
|
||||
study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
|
||||
|
||||
best = study.best_params
|
||||
n_rounds = best.pop("n_rounds", 500)
|
||||
best.update({"objective": "regression", "metric": "rmse", "verbosity": -1})
|
||||
callbacks = [lgb.early_stopping(50, verbose=False), lgb.log_evaluation(period=0)]
|
||||
model = lgb.train(
|
||||
best, dtrain, num_boost_round=n_rounds,
|
||||
valid_sets=[dval], callbacks=callbacks,
|
||||
)
|
||||
logger.info("LightGBM trained — best RMSE: %.4f", study.best_value)
|
||||
return best, model
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("LightGBM training failed: %s", e)
|
||||
return {}, None
|
||||
|
||||
def _train_catboost(
|
||||
self,
|
||||
X_train: np.ndarray, y_train: np.ndarray,
|
||||
X_val: np.ndarray, y_val: np.ndarray,
|
||||
n_trials: int,
|
||||
) -> tuple[dict, Any]:
|
||||
"""Train CatBoost with Optuna hyperparameter optimization."""
|
||||
try:
|
||||
import optuna
|
||||
from catboost import CatBoostRegressor, Pool
|
||||
|
||||
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
||||
|
||||
train_pool = Pool(X_train, label=y_train, feature_names=FEATURE_NAMES)
|
||||
val_pool = Pool(X_val, label=y_val, feature_names=FEATURE_NAMES)
|
||||
|
||||
def objective(trial: optuna.Trial) -> float:
|
||||
params = {
|
||||
"iterations": trial.suggest_int("iterations", 100, 1000),
|
||||
"depth": trial.suggest_int("depth", 4, 10),
|
||||
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
|
||||
"l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
|
||||
"bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
|
||||
"random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
|
||||
"verbose": 0,
|
||||
"loss_function": "RMSE",
|
||||
"early_stopping_rounds": 50,
|
||||
}
|
||||
model = CatBoostRegressor(**params)
|
||||
model.fit(train_pool, eval_set=val_pool, verbose=0)
|
||||
preds = model.predict(val_pool)
|
||||
rmse = float(np.sqrt(np.mean((preds - y_val) ** 2)))
|
||||
return rmse
|
||||
|
||||
study = optuna.create_study(direction="minimize")
|
||||
study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
|
||||
|
||||
best = study.best_params
|
||||
best.update({"verbose": 0, "loss_function": "RMSE", "early_stopping_rounds": 50})
|
||||
model = CatBoostRegressor(**best)
|
||||
model.fit(train_pool, eval_set=val_pool, verbose=0)
|
||||
logger.info("CatBoost trained — best RMSE: %.4f", study.best_value)
|
||||
return best, model
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("CatBoost training failed: %s", e)
|
||||
return {}, None
|
||||
|
||||
def _evaluate_ensemble(
|
||||
self, models: dict[str, Any], X_test: np.ndarray, y_test: np.ndarray
|
||||
) -> dict:
|
||||
"""Evaluate ensemble performance on a test set."""
|
||||
if not models:
|
||||
return {"mae": 0.0, "mape": 0.0, "rmse": 0.0, "r2": 0.0}
|
||||
|
||||
predictions = []
|
||||
weights = []
|
||||
for name, model in models.items():
|
||||
w = ENSEMBLE_WEIGHTS.get(name, 0.0)
|
||||
features = X_test
|
||||
if name == "xgboost":
|
||||
import xgboost as xgb
|
||||
preds = model.predict(xgb.DMatrix(features, feature_names=FEATURE_NAMES))
|
||||
elif name == "lightgbm":
|
||||
preds = model.predict(features)
|
||||
elif name == "catboost":
|
||||
preds = model.predict(features)
|
||||
else:
|
||||
continue
|
||||
predictions.append(preds * w)
|
||||
weights.append(w)
|
||||
|
||||
total_weight = sum(weights) or 1.0
|
||||
ensemble_preds = sum(predictions) / total_weight
|
||||
|
||||
# Metrics in log-space then convert
|
||||
y_actual = np.exp(y_test)
|
||||
y_pred = np.exp(ensemble_preds)
|
||||
|
||||
mae = float(np.mean(np.abs(y_actual - y_pred)))
|
||||
mape = float(np.mean(np.abs((y_actual - y_pred) / y_actual))) * 100
|
||||
rmse = float(np.sqrt(np.mean((y_actual - y_pred) ** 2)))
|
||||
ss_res = np.sum((y_actual - y_pred) ** 2)
|
||||
ss_tot = np.sum((y_actual - np.mean(y_actual)) ** 2)
|
||||
r2 = float(1.0 - ss_res / ss_tot) if ss_tot > 0 else 0.0
|
||||
|
||||
return {
|
||||
"mae": round(mae, 2),
|
||||
"mape": round(mape, 2),
|
||||
"rmse": round(rmse, 2),
|
||||
"r2": round(r2, 4),
|
||||
}
|
||||
|
||||
def _save_model(self, name: str, model: Any, directory: Path) -> None:
|
||||
"""Save a trained model to the specified directory."""
|
||||
if name == "xgboost":
|
||||
model.save_model(str(directory / "avm_v2_xgboost.json"))
|
||||
elif name == "lightgbm":
|
||||
model.save_model(str(directory / "avm_v2_lightgbm.txt"))
|
||||
elif name == "catboost":
|
||||
model.save_model(str(directory / "avm_v2_catboost.cbm"))
|
||||
|
||||
# ── Model registry ──────────────────────────────────────────
|
||||
|
||||
def _get_registry_path(self, model_dir: Path | None = None) -> Path:
|
||||
"""Get the path to the model registry JSON file."""
|
||||
if model_dir is None:
|
||||
from app.config import settings
|
||||
model_dir = Path(settings.model_path)
|
||||
return model_dir / "model_registry.json"
|
||||
|
||||
def _load_registry(self, model_dir: Path | None = None) -> list[dict]:
|
||||
"""Load the model registry from disk."""
|
||||
path = self._get_registry_path(model_dir)
|
||||
if path.exists():
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def _save_registry(self, entries: list[dict], model_dir: Path | None = None) -> None:
|
||||
"""Save the model registry to disk."""
|
||||
path = self._get_registry_path(model_dir)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(path, "w") as f:
|
||||
json.dump(entries, f, indent=2)
|
||||
|
||||
def _register_model(self, info: AVMv2ModelInfo, model_dir: Path) -> None:
|
||||
"""Register a new model version and mark it as active."""
|
||||
entries = self._load_registry(model_dir)
|
||||
# Deactivate previous active models
|
||||
for entry in entries:
|
||||
entry["is_active"] = False
|
||||
entries.append({
|
||||
"model_version": info.model_version,
|
||||
"created_at": info.created_at,
|
||||
"metrics": info.metrics,
|
||||
"is_active": True,
|
||||
"ab_test_traffic_pct": info.ab_test_traffic_pct,
|
||||
})
|
||||
self._save_registry(entries, model_dir)
|
||||
self._model_registry = [
|
||||
AVMv2ModelInfo(**e) for e in entries
|
||||
]
|
||||
|
||||
def get_model_info(self) -> AVMv2ModelInfo:
|
||||
"""Return current active model information."""
|
||||
# Check registry for active model
|
||||
entries = self._load_registry()
|
||||
for entry in reversed(entries):
|
||||
if entry.get("is_active"):
|
||||
return AVMv2ModelInfo(**entry)
|
||||
return AVMv2ModelInfo(
|
||||
model_version=self._model_version,
|
||||
created_at=datetime.now(timezone.utc).isoformat(),
|
||||
@@ -535,6 +1004,52 @@ class AVMv2EnsembleService:
|
||||
ab_test_traffic_pct=0.0,
|
||||
)
|
||||
|
||||
def list_versions(self) -> list[AVMv2ModelInfo]:
|
||||
"""List all registered model versions."""
|
||||
entries = self._load_registry()
|
||||
return [AVMv2ModelInfo(**e) for e in entries]
|
||||
|
||||
def rollback(self, target_version: str) -> AVMv2ModelInfo:
|
||||
"""Rollback to a previously trained model version.
|
||||
|
||||
Copies the target version's artifacts to the active model directory
|
||||
and updates the registry.
|
||||
"""
|
||||
from app.config import settings
|
||||
|
||||
model_dir = Path(settings.model_path)
|
||||
version_dir = model_dir / "versions" / target_version
|
||||
|
||||
if not version_dir.exists():
|
||||
raise ValueError(f"Model version {target_version} not found")
|
||||
|
||||
# Copy versioned artifacts to active directory
|
||||
for artifact in version_dir.iterdir():
|
||||
if artifact.is_file():
|
||||
shutil.copy2(artifact, model_dir / artifact.name)
|
||||
|
||||
# Update registry
|
||||
entries = self._load_registry(model_dir)
|
||||
found = False
|
||||
for entry in entries:
|
||||
entry["is_active"] = entry["model_version"] == target_version
|
||||
if entry["model_version"] == target_version:
|
||||
found = True
|
||||
|
||||
if not found:
|
||||
raise ValueError(f"Model version {target_version} not in registry")
|
||||
|
||||
self._save_registry(entries, model_dir)
|
||||
|
||||
# Reload models from disk
|
||||
self._models = {}
|
||||
self._load_models()
|
||||
self._model_version = target_version
|
||||
|
||||
logger.info("Rolled back to model version %s", target_version)
|
||||
active = next(e for e in entries if e["is_active"])
|
||||
return AVMv2ModelInfo(**active)
|
||||
|
||||
# ── A/B comparison ─────────────────────────────────────────
|
||||
|
||||
def compare_v1(self, req: ABComparisonRequest) -> ABComparisonResponse:
|
||||
@@ -562,13 +1077,18 @@ class AVMv2EnsembleService:
|
||||
area_m2=req.area_m2,
|
||||
rooms=req.rooms or req.bedrooms,
|
||||
has_legal_paper=req.has_legal_paper,
|
||||
neighborhood_score=req.neighborhood_score,
|
||||
distance_to_cbd_km=req.distance_to_cbd_km,
|
||||
distance_to_metro_km=req.distance_to_metro_km,
|
||||
flood_zone_risk=req.flood_zone_risk,
|
||||
building_age_years=req.building_age_years,
|
||||
floor_level=req.floor_level,
|
||||
total_floors=req.total_floors,
|
||||
direction=req.direction,
|
||||
has_elevator=req.has_elevator,
|
||||
has_parking=req.has_parking,
|
||||
has_pool=req.has_pool,
|
||||
developer_reputation=req.developer_reputation,
|
||||
renovation_score=req.renovation_score,
|
||||
view_quality=req.view_quality,
|
||||
interior_quality=req.interior_quality,
|
||||
|
||||
@@ -65,9 +65,10 @@ def test_predict_v2_returns_drivers():
|
||||
|
||||
|
||||
def test_predict_v2_with_full_features():
|
||||
"""Predict with all features populated."""
|
||||
"""Predict with all features populated (including new v2 features)."""
|
||||
payload = {
|
||||
**_PREDICT_PAYLOAD,
|
||||
"neighborhood_score": 0.8,
|
||||
"distance_to_cbd_km": 5.0,
|
||||
"distance_to_metro_km": 0.8,
|
||||
"distance_to_school_km": 0.5,
|
||||
@@ -75,11 +76,15 @@ def test_predict_v2_with_full_features():
|
||||
"distance_to_park_km": 0.3,
|
||||
"distance_to_mall_km": 1.0,
|
||||
"flood_zone_risk": 0.1,
|
||||
"floor_level": 12,
|
||||
"total_floors": 25,
|
||||
"direction": "southeast",
|
||||
"floor_ratio": 1.2,
|
||||
"building_age_years": 5,
|
||||
"has_elevator": True,
|
||||
"has_parking": True,
|
||||
"has_pool": False,
|
||||
"developer_reputation": 0.9,
|
||||
"avg_price_district_3m_vnd_m2": 85_000_000,
|
||||
"listing_density": 12.5,
|
||||
"absorption_rate": 0.3,
|
||||
@@ -149,8 +154,93 @@ def test_predict_v2_invalid_area():
|
||||
assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_train_v2_scaffold():
|
||||
"""Training endpoint should return scaffold response."""
|
||||
# ── New v2 features: neighborhood, floor, direction, developer ──
|
||||
|
||||
|
||||
def test_predict_v2_neighborhood_premium():
|
||||
"""High neighborhood score should increase price."""
|
||||
low_nb = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "neighborhood_score": 0.2},
|
||||
).json()
|
||||
high_nb = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "neighborhood_score": 0.9},
|
||||
).json()
|
||||
|
||||
assert high_nb["estimated_price_vnd"] > low_nb["estimated_price_vnd"]
|
||||
|
||||
|
||||
def test_predict_v2_floor_level_premium():
|
||||
"""Higher floor apartments should command a premium."""
|
||||
ground = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "floor_level": 2, "total_floors": 25},
|
||||
).json()
|
||||
high = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "floor_level": 20, "total_floors": 25},
|
||||
).json()
|
||||
|
||||
assert high["estimated_price_vnd"] > ground["estimated_price_vnd"]
|
||||
|
||||
|
||||
def test_predict_v2_direction_premium():
|
||||
"""South-facing properties should be priced higher than north-facing."""
|
||||
south = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "direction": "south"},
|
||||
).json()
|
||||
north = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "direction": "north"},
|
||||
).json()
|
||||
|
||||
assert south["estimated_price_vnd"] > north["estimated_price_vnd"]
|
||||
|
||||
|
||||
def test_predict_v2_developer_reputation():
|
||||
"""Properties from reputable developers should be valued higher."""
|
||||
low_rep = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "developer_reputation": 0.2},
|
||||
).json()
|
||||
high_rep = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "developer_reputation": 0.9},
|
||||
).json()
|
||||
|
||||
assert high_rep["estimated_price_vnd"] > low_rep["estimated_price_vnd"]
|
||||
|
||||
|
||||
def test_predict_v2_direction_defaults_unknown():
|
||||
"""Unknown direction should not affect price (neutral)."""
|
||||
explicit = client.post(
|
||||
"/avm/v2/predict",
|
||||
json={**_PREDICT_PAYLOAD, "direction": "unknown"},
|
||||
).json()
|
||||
default = client.post("/avm/v2/predict", json=_PREDICT_PAYLOAD).json()
|
||||
|
||||
assert explicit["estimated_price_vnd"] == default["estimated_price_vnd"]
|
||||
|
||||
|
||||
def test_predict_v2_drivers_include_new_features():
|
||||
"""Drivers should include neighborhood_score, direction, floor_level."""
|
||||
resp = client.post("/avm/v2/predict", json=_PREDICT_PAYLOAD)
|
||||
data = resp.json()
|
||||
driver_names = {d["feature"] for d in data["drivers"]}
|
||||
|
||||
assert "neighborhood_score" in driver_names
|
||||
assert "direction_encoded" in driver_names
|
||||
assert "floor_level" in driver_names
|
||||
assert "developer_reputation" in driver_names
|
||||
|
||||
|
||||
# ── Training & model info ───────────────────────────────────────
|
||||
|
||||
|
||||
def test_train_v2_no_data():
|
||||
"""Training without data returns scaffold with zero metrics."""
|
||||
resp = client.post(
|
||||
"/avm/v2/train",
|
||||
json={"optuna_trials": 10},
|
||||
@@ -159,10 +249,7 @@ def test_train_v2_scaffold():
|
||||
data = resp.json()
|
||||
assert "model_version" in data
|
||||
assert "ensemble-v2-" in data["model_version"]
|
||||
assert data["metrics"]["mae"] == 0.0 # scaffold returns zeros
|
||||
assert "xgboost" in data["best_params"]
|
||||
assert "lightgbm" in data["best_params"]
|
||||
assert "catboost" in data["best_params"]
|
||||
assert data["training_samples"] == 0
|
||||
|
||||
|
||||
def test_model_info_v2():
|
||||
@@ -174,6 +261,26 @@ def test_model_info_v2():
|
||||
assert data["is_active"] is True
|
||||
|
||||
|
||||
# ── Model versioning ────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_list_versions():
|
||||
"""Versions endpoint returns a list."""
|
||||
resp = client.get("/avm/v2/versions")
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert isinstance(data, list)
|
||||
|
||||
|
||||
def test_rollback_not_found():
|
||||
"""Rollback to non-existent version returns 404."""
|
||||
resp = client.post(
|
||||
"/avm/v2/rollback",
|
||||
json={"target_version": "nonexistent-version-xyz"},
|
||||
)
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
# ── A/B comparison tests ─────────────────────────────────────
|
||||
|
||||
_COMPARE_PAYLOAD = {
|
||||
@@ -227,12 +334,17 @@ def test_compare_v1_with_v2_features():
|
||||
"""Compare endpoint passes v2-specific features correctly."""
|
||||
payload = {
|
||||
**_COMPARE_PAYLOAD,
|
||||
"neighborhood_score": 0.8,
|
||||
"distance_to_cbd_km": 5.0,
|
||||
"distance_to_metro_km": 0.8,
|
||||
"flood_zone_risk": 0.1,
|
||||
"building_age_years": 3,
|
||||
"floor_level": 15,
|
||||
"total_floors": 30,
|
||||
"direction": "southeast",
|
||||
"has_elevator": True,
|
||||
"has_parking": True,
|
||||
"developer_reputation": 0.85,
|
||||
"renovation_score": 0.9,
|
||||
"view_quality": 0.8,
|
||||
"interior_quality": 0.85,
|
||||
|
||||
Reference in New Issue
Block a user