feat: implement project development module, transfer management features, and industrial AVM model integration

This commit is contained in:
Ho Ngoc Hai
2026-04-18 20:34:35 +07:00
parent 0f3b4d7b0d
commit 38b9def99a
66 changed files with 9051 additions and 17 deletions

View File

@@ -1,12 +1,13 @@
"""Industrial AVM — Rent estimation service for industrial parks.
Heuristic fallback when trained models are not available.
Uses gradient boosting approach similar to residential AVM v2.
Preference order: park-level ridge baseline (v1, TEC-2768) → XGBoost → heuristic.
Heuristic fallback remains when no trained artifact is on disk.
"""
import logging
import math
import os
from datetime import datetime, timezone
import pickle
from typing import Any
import numpy as np
@@ -20,6 +21,21 @@ from app.models.avm_industrial import (
logger = logging.getLogger(__name__)
RIDGE_ARTIFACT_NAME = "avm_industrial_park_ridge_v1.pkl"
# Map API property types to the rent head trained in the ridge baseline.
# Land rent is stored as USD/m²/year; others as USD/m²/month — convert where
# needed so the response stays in USD/m²/month.
_PROPERTY_TO_HEAD: dict[str, str] = {
"warehouse": "rbw",
"ready_built_warehouse": "rbw",
"factory": "rbf",
"ready_built_factory": "rbf",
"office_in_park": "rbf",
"open_yard": "land",
"industrial_land": "land",
}
# ── Feature ordering for model input ────────────────────────────
INDUSTRIAL_FEATURE_NAMES = [
"region_encoded",
@@ -169,40 +185,171 @@ def _find_comparables(req: IndustrialAVMRequest) -> list[IndustrialComparable]:
class IndustrialAVMService:
"""Industrial property rent estimation service.
Uses gradient boosting when a trained model is available,
falls back to heuristic pricing for development/demo.
Preference order when a trained artifact is available:
1. Ridge v1 (park-level baseline with conformal CIs, TEC-2768)
2. XGBoost (legacy, listing-level)
3. Multi-factor heuristic (always available)
"""
def __init__(self) -> None:
self._model: Any = None
self._model_version = "heuristic-v1"
self._backend: str = "heuristic"
self._load_model()
def _load_model(self) -> None:
    """Attempt to load trained industrial AVM artifacts (ridge first).

    Tries, in order, the pickled ridge artifact and then the XGBoost JSON
    model inside ``settings.model_path``. On the first success it sets
    ``_model``, ``_model_version`` and ``_backend`` and returns. Any load
    failure is logged and the next option is tried; if nothing loads, the
    instance keeps whatever state it had before the call.
    """
    try:
        from app.config import settings

        model_path = settings.model_path
    except Exception:
        logger.info("Industrial AVM: config unavailable — using heuristic")
        return

    ridge_path = os.path.join(model_path, RIDGE_ARTIFACT_NAME)
    if os.path.exists(ridge_path):
        try:
            # SECURITY: unpickling executes arbitrary code. This is only
            # acceptable if the model directory is trusted and not writable
            # by untrusted parties — NOTE(review): confirm for deployment.
            with open(ridge_path, "rb") as f:
                artifact = pickle.load(f)
            if not isinstance(artifact, dict) or artifact.get("version") != "ridge-industrial-v1":
                raise ValueError(f"Unexpected artifact version in {ridge_path}")
        except Exception as exc:  # keep service alive on artifact corruption
            logger.warning("Failed to load ridge artifact (%s); falling back", exc)
        else:
            self._model = artifact
            self._model_version = "ridge-industrial-v1"
            self._backend = "ridge"
            logger.info("Loaded industrial AVM ridge artifact from %s", ridge_path)
            return

    xgb_path = os.path.join(model_path, "avm_industrial_xgb.json")
    if os.path.exists(xgb_path):
        try:
            import xgboost as xgb

            booster = xgb.Booster()
            booster.load_model(xgb_path)
        except Exception as exc:
            # Previously swallowed silently; surface it so a missing xgboost
            # install or a corrupt model file is visible in the logs.
            logger.warning(
                "Failed to load industrial AVM xgb model from %s (%s); falling back",
                xgb_path,
                exc,
            )
        else:
            self._model = booster
            self._model_version = "xgb-industrial-v1"
            self._backend = "xgb"
            logger.info("Loaded industrial AVM xgb model from %s", xgb_path)
            return

    logger.info("No trained industrial AVM model — using heuristic")
def predict(self, req: IndustrialAVMRequest) -> IndustrialAVMResponse:
"""Predict industrial property rent."""
if self._model is not None:
if self._backend == "ridge":
return self._predict_ridge(req)
if self._backend == "xgb" and self._model is not None:
return self._predict_model(req)
return self._predict_heuristic(req)
def _featureize_ridge(self, req: IndustrialAVMRequest, spec: dict) -> np.ndarray:
"""Build the exact feature vector used during ridge training.
Feature ordering must match `spec["feature_cols"]` which is the canonical
order emitted by the trainer. Sources:
- numeric fields come straight from the request
- province FDI comes from the artifact lookup (fallback to default)
- target-industry flags approximate one-hots against top-6 list
"""
province = (req.province or "").strip().lower()
fdi = spec["province_fdi"].get(province, spec["default_fdi"])
occupancy = float(req.park_occupancy_rate)
if occupancy > 1.5:
occupancy = occupancy / 100.0
occupancy = min(max(occupancy, 0.0), 1.0)
feats: dict[str, float] = {
"occupancy": occupancy,
"log_area_ha": math.log1p(max(0.0, float(req.park_area_ha))),
"park_age_years": float(max(0, int(req.park_age_years))),
"log_dist_port_km": math.log1p(max(0.0, float(req.distance_to_port_km))),
"log_dist_airport_km": math.log1p(max(0.0, float(req.distance_to_airport_km))),
"log_dist_highway_km": math.log1p(max(0.0, float(req.distance_to_highway_km))),
"logistics_connectivity_score": float(req.logistics_connectivity_score),
"log_fdi_province": math.log1p(
max(0.0, float(req.fdi_province_musd) or fdi)
),
"has_special_zone": float(
req.zoning.lower() in {"free_trade_zone", "high_tech"}
),
}
# Property type flags can proxy certain target-industry signals but the
# trainer's industry one-hots are park-level. At inference we don't know
# the park's industry mix, so default to 0 and let the province/region
# fixed effects carry the signal.
for ind in spec["top_industries"]:
feats[f"ind_{ind}"] = 0.0
region = (req.region or "south").lower()
for r in spec["region_order"][1:]:
feats[f"region_{r}"] = float(region == r)
vec = np.array([feats[c] for c in spec["feature_cols"]], dtype=np.float64)
return vec
def _predict_ridge(self, req: IndustrialAVMRequest) -> IndustrialAVMResponse:
    """Predict rent with the ridge v1 park-level baseline.

    Standardizes the request features with the head's stored scaler
    statistics, applies the linear head in log space, and widens the point
    estimate by the head's ``q80_log`` value to form the rent range.
    Unknown property types fall back to the ``rbf`` head.
    """
    artifact = self._model
    feature_spec = artifact["feature_spec"]
    raw = self._featureize_ridge(req, feature_spec)

    head_key = _PROPERTY_TO_HEAD.get(req.property_type.lower(), "rbf")
    head = artifact["heads"][head_key]

    # A zero scale would divide by zero; such columns are left unscaled.
    centered = raw - head["scaler_mean"]
    safe_scale = np.where(head["scaler_scale"] == 0, 1.0, head["scaler_scale"])
    z = centered / safe_scale

    log_rent = float(z @ head["coefficients"] + head["intercept"])
    half_width = float(head["q80_log"])

    # Heads predict in their native units: USD/m²/month for rbf/rbw and
    # USD/m²/year for land. The response always reports monthly USD/m², so
    # only the land head is divided by 12.
    native_mid = math.expm1(log_rent)
    native_low = math.expm1(log_rent - half_width)
    native_high = math.expm1(log_rent + half_width)
    months = 12.0 if head_key == "land" else 1.0
    rent = native_mid / months
    low = native_low / months
    high = native_high / months

    comparables = _find_comparables(req)

    # Drivers: the (up to) eight largest absolute standardized
    # contributions, each expressed as a share of the total.
    contrib = head["coefficients"] * z
    magnitude = np.abs(contrib)
    ranked = np.argsort(-magnitude)[:8]
    denom = float(np.sum(magnitude)) or 1.0
    drivers = []
    for idx in ranked:
        if not abs(contrib[idx]) > 1e-6:
            continue
        drivers.append(
            FeatureImportance(
                feature=head["feature_cols"][idx],
                importance=round(float(abs(contrib[idx]) / denom), 4),
            )
        )

    monthly = max(0.0, rent)
    return IndustrialAVMResponse(
        estimated_rent_usd_m2=round(monthly, 2),
        confidence=round(float(head.get("coverage_80_loo", 0.80)), 2),
        rent_range_low_usd_m2=round(max(0.0, low), 2),
        rent_range_high_usd_m2=round(max(0.0, high), 2),
        annual_rent_usd_m2=round(monthly * 12, 2),
        total_monthly_rent_usd=round(monthly * req.area_m2, 2),
        comparables=comparables,
        drivers=drivers,
        model_version=self._model_version,
    )
def _predict_model(self, req: IndustrialAVMRequest) -> IndustrialAVMResponse:
"""Predict using trained gradient boosting model."""
import xgboost as xgb

View File

@@ -0,0 +1,422 @@
[
{
"id": "seed-kcn-001",
"name": "KCN VSIP Bắc Ninh",
"slug": "vsip-bac-ninh",
"province": "Bắc Ninh",
"region": "north",
"status": "operational",
"totalAreaHa": 700,
"occupancyRate": 92,
"establishedYear": 2007,
"landRentUsdM2Year": 90,
"rbfRentUsdM2Month": 5.5,
"rbwRentUsdM2Month": 4.8,
"connectivity": {
"nearestPort": {"name": "Cảng Hải Phòng", "distanceKm": 110},
"airport": {"name": "Nội Bài", "distanceKm": 35},
"highway": {"name": "QL 1A", "distanceKm": 5}
},
"incentives": {"specialZone": false},
"targetIndustries": ["electronics", "automotive", "precision engineering", "food processing"]
},
{
"id": "seed-kcn-002",
"name": "KCN VSIP Bình Dương I",
"slug": "vsip-binh-duong-1",
"province": "Bình Dương",
"region": "south",
"status": "full",
"totalAreaHa": 500,
"occupancyRate": 100,
"establishedYear": 1996,
"landRentUsdM2Year": 110,
"rbfRentUsdM2Month": 6.0,
"rbwRentUsdM2Month": 5.2,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 25},
"airport": {"name": "Tân Sơn Nhất", "distanceKm": 20},
"highway": {"name": "ĐL Mỹ Phước - Tân Vạn", "distanceKm": 2}
},
"incentives": {"specialZone": false},
"targetIndustries": ["electronics", "garment", "food processing", "logistics"]
},
{
"id": "seed-kcn-003",
"name": "KCN Amata Đồng Nai",
"slug": "amata-dong-nai",
"province": "Đồng Nai",
"region": "south",
"status": "operational",
"totalAreaHa": 700,
"occupancyRate": 88,
"establishedYear": 1994,
"landRentUsdM2Year": 95,
"rbfRentUsdM2Month": 5.0,
"rbwRentUsdM2Month": 4.5,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 30},
"airport": {"name": "Long Thành", "distanceKm": 25},
"highway": {"name": "QL 1A", "distanceKm": 2}
},
"incentives": {"specialZone": false},
"targetIndustries": ["automotive", "electronics", "chemicals", "machinery"]
},
{
"id": "seed-kcn-004",
"name": "KCN Amata Long An",
"slug": "amata-long-an",
"province": "Long An",
"region": "south",
"status": "under_construction",
"totalAreaHa": 410,
"occupancyRate": 35,
"establishedYear": 2020,
"landRentUsdM2Year": 75,
"rbfRentUsdM2Month": 4.5,
"rbwRentUsdM2Month": 3.8,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 45},
"airport": {"name": "Tân Sơn Nhất", "distanceKm": 35},
"highway": {"name": "Vành đai 3 TP.HCM", "distanceKm": 8}
},
"incentives": {"specialZone": true},
"targetIndustries": ["logistics", "food processing", "consumer goods", "light manufacturing"]
},
{
"id": "seed-kcn-005",
"name": "KCN Nam Đình Vũ",
"slug": "nam-dinh-vu",
"province": "Hải Phòng",
"region": "north",
"status": "operational",
"totalAreaHa": 1329,
"occupancyRate": 75,
"establishedYear": 2014,
"landRentUsdM2Year": 80,
"rbfRentUsdM2Month": 4.8,
"rbwRentUsdM2Month": 4.0,
"connectivity": {
"nearestPort": {"name": "Cảng Đình Vũ", "distanceKm": 2},
"airport": {"name": "Cát Bi", "distanceKm": 15},
"highway": {"name": "Cao tốc Hà Nội - Hải Phòng", "distanceKm": 10}
},
"incentives": {"specialZone": true},
"targetIndustries": ["petrochemicals", "logistics", "heavy industry", "steel"]
},
{
"id": "seed-kcn-006",
"name": "KCN Long Hậu",
"slug": "long-hau",
"province": "Long An",
"region": "south",
"status": "operational",
"totalAreaHa": 311,
"occupancyRate": 85,
"establishedYear": 2006,
"landRentUsdM2Year": 85,
"rbfRentUsdM2Month": 4.5,
"rbwRentUsdM2Month": 3.8,
"connectivity": {
"nearestPort": {"name": "Cảng Hiệp Phước", "distanceKm": 5},
"airport": {"name": "Tân Sơn Nhất", "distanceKm": 25},
"highway": {"name": "Nguyễn Hữu Thọ", "distanceKm": 3}
},
"incentives": {"specialZone": false},
"targetIndustries": ["logistics", "food processing", "garment", "packaging"]
},
{
"id": "seed-kcn-007",
"name": "KCN Tân Thuận (EPZ)",
"slug": "tan-thuan-epz",
"province": "TP. Hồ Chí Minh",
"region": "south",
"status": "full",
"totalAreaHa": 300,
"occupancyRate": 100,
"establishedYear": 1991,
"landRentUsdM2Year": 130,
"rbfRentUsdM2Month": 7.0,
"rbwRentUsdM2Month": 6.0,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 15},
"airport": {"name": "Tân Sơn Nhất", "distanceKm": 12},
"highway": {"name": "Nguyễn Văn Linh", "distanceKm": 1}
},
"incentives": {"specialZone": true},
"targetIndustries": ["electronics", "precision engineering", "software", "export manufacturing"]
},
{
"id": "seed-kcn-008",
"name": "KCN Thăng Long",
"slug": "thang-long",
"province": "Hà Nội",
"region": "north",
"status": "full",
"totalAreaHa": 274,
"occupancyRate": 100,
"establishedYear": 1997,
"landRentUsdM2Year": 105,
"rbfRentUsdM2Month": 6.0,
"rbwRentUsdM2Month": 5.0,
"connectivity": {
"nearestPort": {"name": "Cảng Hải Phòng", "distanceKm": 120},
"airport": {"name": "Nội Bài", "distanceKm": 16},
"highway": {"name": "Nội Bài - Lào Cai", "distanceKm": 5}
},
"incentives": {"specialZone": false},
"targetIndustries": ["electronics", "automotive", "precision mechanics", "IT"]
},
{
"id": "seed-kcn-009",
"name": "KCN KTG Industrial Nhơn Trạch",
"slug": "ktg-nhon-trach",
"province": "Đồng Nai",
"region": "south",
"status": "operational",
"totalAreaHa": 250,
"occupancyRate": 78,
"establishedYear": 2018,
"landRentUsdM2Year": 80,
"rbfRentUsdM2Month": 4.8,
"rbwRentUsdM2Month": 4.0,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 20},
"airport": {"name": "Long Thành", "distanceKm": 15},
"highway": {"name": "Cao tốc Long Thành - Dầu Giây", "distanceKm": 5}
},
"incentives": {"specialZone": false},
"targetIndustries": ["logistics", "e-commerce fulfillment", "light manufacturing", "food processing"]
},
{
"id": "seed-kcn-010",
"name": "KCN Prodezi Nhơn Trạch",
"slug": "prodezi-nhon-trach",
"province": "Đồng Nai",
"region": "south",
"status": "operational",
"totalAreaHa": 340,
"occupancyRate": 70,
"establishedYear": 2015,
"landRentUsdM2Year": 72,
"rbfRentUsdM2Month": 4.2,
"rbwRentUsdM2Month": 3.5,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 25},
"airport": {"name": "Long Thành", "distanceKm": 12},
"highway": {"name": "QL 51", "distanceKm": 8}
},
"incentives": {"specialZone": false},
"targetIndustries": ["machinery", "plastics", "packaging", "consumer goods"]
},
{
"id": "seed-kcn-011",
"name": "KCN Thăng Long II Hưng Yên",
"slug": "thang-long-2-hung-yen",
"province": "Hưng Yên",
"region": "north",
"status": "operational",
"totalAreaHa": 345,
"occupancyRate": 82,
"establishedYear": 2004,
"landRentUsdM2Year": 78,
"rbfRentUsdM2Month": 4.5,
"rbwRentUsdM2Month": 3.8,
"connectivity": {
"nearestPort": {"name": "Cảng Hải Phòng", "distanceKm": 85},
"airport": {"name": "Nội Bài", "distanceKm": 50},
"highway": {"name": "QL 5", "distanceKm": 3}
},
"incentives": {"specialZone": false},
"targetIndustries": ["electronics", "automotive parts", "precision engineering"]
},
{
"id": "seed-kcn-012",
"name": "KCN Yên Phong Bắc Ninh",
"slug": "yen-phong-bac-ninh",
"province": "Bắc Ninh",
"region": "north",
"status": "operational",
"totalAreaHa": 658,
"occupancyRate": 95,
"establishedYear": 2008,
"landRentUsdM2Year": 85,
"rbfRentUsdM2Month": 5.0,
"rbwRentUsdM2Month": 4.2,
"connectivity": {
"nearestPort": {"name": "Cảng Hải Phòng", "distanceKm": 100},
"airport": {"name": "Nội Bài", "distanceKm": 30},
"highway": {"name": "QL 18", "distanceKm": 5}
},
"incentives": {"specialZone": false},
"targetIndustries": ["electronics", "display manufacturing", "semiconductors", "automotive"]
},
{
"id": "seed-kcn-013",
"name": "KCN Bà Rịa - Vũng Tàu",
"slug": "ba-ria-vung-tau",
"province": "Bà Rịa - Vũng Tàu",
"region": "south",
"status": "operational",
"totalAreaHa": 450,
"occupancyRate": 72,
"establishedYear": 2002,
"landRentUsdM2Year": 65,
"rbfRentUsdM2Month": 3.8,
"rbwRentUsdM2Month": 3.2,
"connectivity": {
"nearestPort": {"name": "Cảng Cái Mép - Thị Vải", "distanceKm": 20},
"airport": {"name": "Long Thành", "distanceKm": 50},
"highway": {"name": "Cao tốc Biên Hòa - Vũng Tàu", "distanceKm": 5}
},
"incentives": {"specialZone": true},
"targetIndustries": ["oil & gas", "petrochemicals", "heavy industry", "steel", "logistics"]
},
{
"id": "seed-kcn-014",
"name": "KCN Becamex Bình Phước",
"slug": "becamex-binh-phuoc",
"province": "Bình Phước",
"region": "south",
"status": "under_construction",
"totalAreaHa": 4686,
"occupancyRate": 25,
"establishedYear": 2021,
"landRentUsdM2Year": 50,
"rbfRentUsdM2Month": 3.5,
"rbwRentUsdM2Month": 3.0,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 85},
"airport": {"name": "Tân Sơn Nhất", "distanceKm": 80},
"highway": {"name": "QL 13", "distanceKm": 3}
},
"incentives": {"specialZone": true},
"targetIndustries": ["agriculture processing", "rubber", "wood processing", "light manufacturing"]
},
{
"id": "seed-kcn-015",
"name": "KCN Đại An Hải Dương",
"slug": "dai-an-hai-duong",
"province": "Hải Dương",
"region": "north",
"status": "operational",
"totalAreaHa": 174,
"occupancyRate": 90,
"establishedYear": 2003,
"landRentUsdM2Year": 70,
"rbfRentUsdM2Month": 4.2,
"rbwRentUsdM2Month": 3.5,
"connectivity": {
"nearestPort": {"name": "Cảng Hải Phòng", "distanceKm": 50},
"airport": {"name": "Nội Bài", "distanceKm": 60},
"highway": {"name": "QL 5", "distanceKm": 2}
},
"incentives": {"specialZone": false},
"targetIndustries": ["garment", "food processing", "mechanics", "electronics assembly"]
},
{
"id": "seed-kcn-016",
"name": "KCN DEEP C Hải Phòng",
"slug": "deep-c-hai-phong",
"province": "Hải Phòng",
"region": "north",
"status": "operational",
"totalAreaHa": 3000,
"occupancyRate": 68,
"establishedYear": 1997,
"landRentUsdM2Year": 75,
"rbfRentUsdM2Month": 4.5,
"rbwRentUsdM2Month": 3.8,
"connectivity": {
"nearestPort": {"name": "Cảng Đình Vũ", "distanceKm": 5},
"airport": {"name": "Cát Bi", "distanceKm": 12},
"highway": {"name": "Cao tốc Hà Nội - Hải Phòng", "distanceKm": 8}
},
"incentives": {"specialZone": true},
"targetIndustries": ["petrochemicals", "LNG", "electronics", "logistics", "renewable energy"]
},
{
"id": "seed-kcn-017",
"name": "KCN Mỹ Phước 3 Bình Dương",
"slug": "my-phuoc-3-binh-duong",
"province": "Bình Dương",
"region": "south",
"status": "operational",
"totalAreaHa": 992,
"occupancyRate": 87,
"establishedYear": 2006,
"landRentUsdM2Year": 82,
"rbfRentUsdM2Month": 4.8,
"rbwRentUsdM2Month": 4.0,
"connectivity": {
"nearestPort": {"name": "Cảng Cát Lái", "distanceKm": 40},
"airport": {"name": "Tân Sơn Nhất", "distanceKm": 35},
"highway": {"name": "Mỹ Phước - Tân Vạn", "distanceKm": 1}
},
"incentives": {"specialZone": false},
"targetIndustries": ["furniture", "garment", "food processing", "electronics assembly", "plastics"]
},
{
"id": "seed-kcn-018",
"name": "KCN Phú Mỹ 2 BRVT",
"slug": "phu-my-2-brvt",
"province": "Bà Rịa - Vũng Tàu",
"region": "south",
"status": "operational",
"totalAreaHa": 380,
"occupancyRate": 65,
"establishedYear": 2007,
"landRentUsdM2Year": 55,
"rbfRentUsdM2Month": 3.5,
"rbwRentUsdM2Month": 3.0,
"connectivity": {
"nearestPort": {"name": "Cảng Cái Mép - Thị Vải", "distanceKm": 10},
"airport": {"name": "Long Thành", "distanceKm": 40},
"highway": {"name": "QL 51", "distanceKm": 3}
},
"incentives": {"specialZone": true},
"targetIndustries": ["petrochemicals", "steel", "power generation", "port logistics"]
},
{
"id": "seed-kcn-019",
"name": "KCN WHA Nghệ An",
"slug": "wha-nghe-an",
"province": "Nghệ An",
"region": "central",
"status": "under_construction",
"totalAreaHa": 498,
"occupancyRate": 15,
"establishedYear": 2022,
"landRentUsdM2Year": 45,
"rbfRentUsdM2Month": 3.0,
"rbwRentUsdM2Month": 2.5,
"connectivity": {
"nearestPort": {"name": "Cảng Cửa Lò", "distanceKm": 15},
"airport": {"name": "Vinh", "distanceKm": 20},
"highway": {"name": "QL 1A", "distanceKm": 5}
},
"incentives": {"specialZone": true},
"targetIndustries": ["electronics assembly", "garment", "food processing", "rubber"]
},
{
"id": "seed-kcn-020",
"name": "KCN Chu Lai Quảng Nam",
"slug": "chu-lai-quang-nam",
"province": "Quảng Nam",
"region": "central",
"status": "operational",
"totalAreaHa": 1550,
"occupancyRate": 55,
"establishedYear": 2003,
"landRentUsdM2Year": 40,
"rbfRentUsdM2Month": 2.8,
"rbwRentUsdM2Month": 2.2,
"connectivity": {
"nearestPort": {"name": "Cảng Kỳ Hà", "distanceKm": 5},
"airport": {"name": "Chu Lai", "distanceKm": 8},
"highway": {"name": "QL 1A", "distanceKm": 3}
},
"incentives": {"specialZone": true},
"targetIndustries": ["automotive", "agriculture machinery", "wood processing", "seafood processing"]
}
]

View File

@@ -0,0 +1,188 @@
{
"version": "ridge-industrial-v1",
"trained_at": "2026-04-18T08:19:02.245595+00:00",
"n_parks_in_source": 20,
"heads": {
"land": {
"target_column": "landRentUsdM2Year",
"n_train": 20,
"alpha": 7.847599703514607,
"mape_loo": 0.1463,
"coverage_80_loo": 0.8,
"q80_log": 0.1883,
"top_coefficients": [
{
"feature": "region_central",
"coef": -0.0873
},
{
"feature": "log_fdi_province",
"coef": 0.0856
},
{
"feature": "occupancy",
"coef": 0.0618
},
{
"feature": "ind_electronics",
"coef": 0.0502
},
{
"feature": "log_dist_airport_km",
"coef": -0.0355
},
{
"feature": "ind_plastics",
"coef": -0.0259
},
{
"feature": "ind_garment",
"coef": 0.0124
},
{
"feature": "region_north",
"coef": -0.0117
}
],
"slices": {
"central": {
"n": 2,
"mape_in_sample": 0.1158,
"median_residual_log": -0.1966
},
"north": {
"n": 7,
"mape_in_sample": 0.0697,
"median_residual_log": -0.0146
},
"south": {
"n": 11,
"mape_in_sample": 0.095,
"median_residual_log": 0.0298
}
}
},
"rbf": {
"target_column": "rbfRentUsdM2Month",
"n_train": 20,
"alpha": 7.847599703514607,
"mape_loo": 0.1118,
"coverage_80_loo": 0.8,
"q80_log": 0.1268,
"top_coefficients": [
{
"feature": "log_fdi_province",
"coef": 0.0582
},
{
"feature": "region_central",
"coef": -0.0529
},
{
"feature": "ind_electronics",
"coef": 0.0348
},
{
"feature": "occupancy",
"coef": 0.0318
},
{
"feature": "log_dist_airport_km",
"coef": -0.0239
},
{
"feature": "ind_plastics",
"coef": -0.0181
},
{
"feature": "log_dist_highway_km",
"coef": -0.0106
},
{
"feature": "ind_food",
"coef": 0.0065
}
],
"slices": {
"central": {
"n": 2,
"mape_in_sample": 0.089,
"median_residual_log": -0.1132
},
"north": {
"n": 7,
"mape_in_sample": 0.0601,
"median_residual_log": -0.0016
},
"south": {
"n": 11,
"mape_in_sample": 0.0758,
"median_residual_log": 0.0139
}
}
},
"rbw": {
"target_column": "rbwRentUsdM2Month",
"n_train": 20,
"alpha": 7.847599703514607,
"mape_loo": 0.1243,
"coverage_80_loo": 0.8,
"q80_log": 0.1214,
"top_coefficients": [
{
"feature": "log_fdi_province",
"coef": 0.0604
},
{
"feature": "region_central",
"coef": -0.0562
},
{
"feature": "ind_electronics",
"coef": 0.0389
},
{
"feature": "occupancy",
"coef": 0.0297
},
{
"feature": "ind_plastics",
"coef": -0.0217
},
{
"feature": "log_dist_airport_km",
"coef": -0.0196
},
{
"feature": "log_dist_highway_km",
"coef": -0.0114
},
{
"feature": "region_north",
"coef": -0.0054
}
],
"slices": {
"central": {
"n": 2,
"mape_in_sample": 0.1026,
"median_residual_log": -0.1232
},
"north": {
"n": 7,
"mape_in_sample": 0.0668,
"median_residual_log": -0.0088
},
"south": {
"n": 11,
"mape_in_sample": 0.0773,
"median_residual_log": 0.0175
}
}
}
},
"warnings": [
"n_train < 30 per head — LOO metrics are noisy; interpret CIs as wide.",
"Targets are log1p-transformed rent; CIs use conformal quantile on log residuals."
]
}

View File

@@ -0,0 +1,458 @@
"""Train the v1 park-level industrial AVM baseline (ridge + monotonic priors).
Context (TEC-2768 / R5.2.1):
The IndustrialPark table ships with ~20 seeded rows carrying three rent
heads: land (usd/m²/year), RBF (ready-built factory, usd/m²/month), and
RBW (ready-built warehouse, usd/m²/month). No IndustrialListing rows are
seeded, so tree-boosted models are not viable at n=20. This script fits a
regularized linear baseline on log-rent with sign-constrained coefficients
that encode domain monotonicity priors (occupancy ↑ rent, distance ↑ rent
↓, etc.). Conformal prediction over LOO residuals gives the 80% CI band.
Usage:
python libs/ai-services/scripts/train_avm_industrial_park.py \
--input libs/ai-services/data/industrial/parks.json \
--out libs/ai-services/models
Produces:
<out>/avm_industrial_park_ridge_v1.pkl — fitted artifact
<out>/avm_industrial_park_ridge_v1.model_card.json — metrics + slices
"""
from __future__ import annotations
import argparse
import json
import math
import os
import pickle
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
import numpy as np
from scipy.optimize import nnls
from sklearn.preprocessing import StandardScaler
# ── Constants ──────────────────────────────────────────────────
ARTIFACT_VERSION = "ridge-industrial-v1"
CURRENT_YEAR = 2026
REGION_ORDER = ["south", "north", "central"]  # drop-first encoding
TOP_INDUSTRIES = ["electronics", "logistics", "automotive", "food", "garment", "plastics"]

# Province → FDI inflow in million USD (trailing 12m, approximate market data).
# Keys MUST be lowercase: every lookup lowercases the province name first, so
# a mixed-case key can never match.
PROVINCE_FDI_MUSD: dict[str, float] = {
    "tp. hồ chí minh": 5500,
    "hà nội": 4200,
    "bình dương": 4800,
    "đồng nai": 3200,
    "bắc ninh": 5800,
    "hải phòng": 2800,
    "long an": 1500,
    "bà rịa - vũng tàu": 1800,
    "hải dương": 800,
    "hưng yên": 1200,
    "bình phước": 400,
    "nghệ an": 350,
    "quảng nam": 500,
    "quảng ngãi": 600,
}
DEFAULT_FDI = 500.0

# Feature expected-sign map (+1: rent↑ when feature↑, -1: rent↓ when feature↑,
# 0: no sign constraint). Region one-hots are not listed here and therefore
# stay unsigned (fixed effect).
SIGN_PRIORS: dict[str, int] = {
    "occupancy": +1,
    "log_area_ha": +1,
    "park_age_years": -1,
    "log_dist_port_km": -1,
    "log_dist_airport_km": -1,
    "log_dist_highway_km": -1,
    "logistics_connectivity_score": +1,
    "log_fdi_province": +1,
    "has_special_zone": +1,
    "ind_electronics": +1,
    "ind_logistics": +1,
    "ind_automotive": +1,
    "ind_food": 0,
    "ind_garment": 0,
    "ind_plastics": 0,
}
MONOTONIC_FEATURES = [f for f, s in SIGN_PRIORS.items() if s != 0]
REGION_FEATURES = [f"region_{r}" for r in REGION_ORDER[1:]]  # drop south
ALL_FEATURES = list(SIGN_PRIORS.keys()) + REGION_FEATURES
# ── Feature engineering ────────────────────────────────────────
@dataclass
class FeatureSpec:
    """Serializable description of how training features are built.

    Every default is a fresh copy of the matching module-level constant, so
    mutating one spec instance never leaks into the constants or into other
    instances.
    """

    # Column order of the feature matrix.
    feature_cols: list[str] = field(default_factory=lambda: [*ALL_FEATURES])
    # Region categories; the first entry is the dropped baseline.
    region_order: list[str] = field(default_factory=lambda: [*REGION_ORDER])
    # Industries that get an `ind_<name>` flag.
    top_industries: list[str] = field(default_factory=lambda: [*TOP_INDUSTRIES])
    # Lowercased province name -> FDI inflow (million USD).
    province_fdi: dict[str, float] = field(default_factory=lambda: {**PROVINCE_FDI_MUSD})
    # FDI used for provinces missing from `province_fdi`.
    default_fdi: float = DEFAULT_FDI
    # Expected coefficient sign per feature (+1, -1, or 0 for unconstrained).
    sign_priors: dict[str, int] = field(default_factory=lambda: {**SIGN_PRIORS})
    # Reference year used to derive park age.
    current_year: int = CURRENT_YEAR
def _connectivity_distance(conn: dict | None, key: str, default: float) -> float:
if not conn or not isinstance(conn, dict):
return default
node = conn.get(key)
if isinstance(node, dict):
dist = node.get("distanceKm") or node.get("km")
if isinstance(dist, (int, float)) and dist >= 0:
return float(dist)
return default
def _logistics_score(dist_port: float, dist_airport: float, dist_highway: float) -> float:
# Inverse-distance composite scaled to [0, 1]. Weights bias toward highway
# proximity which matters most for trucking in VN industrial flows.
def inv(d: float, cap: float) -> float:
return max(0.0, 1.0 - min(d, cap) / cap)
return round(
0.25 * inv(dist_port, 120)
+ 0.20 * inv(dist_airport, 80)
+ 0.55 * inv(dist_highway, 20),
4,
)
def _industry_match(industries: list[str], target: str) -> int:
lowered = [i.lower() for i in industries or []]
return int(any(target in i for i in lowered))
def featureize(row: dict, spec: FeatureSpec) -> dict[str, float]:
    """Convert one park record into a flat name -> value feature mapping.

    Missing fields fall back to defaults: occupancy 0, area 0, an
    establishment year ten years before ``spec.current_year``, distances of
    60 / 30 / 5 km (port / airport / highway), ``spec.default_fdi`` and
    region ``"south"``.
    """
    # Seed data stores occupancy as 0-100; values above 1.5 are treated as a
    # percentage and normalized to [0, 1].
    occ = row.get("occupancyRate") or 0
    if occ > 1.5:
        occ = occ / 100.0
    occ = min(max(occ, 0.0), 1.0)

    area = float(row.get("totalAreaHa") or 0.0)

    year_established = row.get("establishedYear") or (spec.current_year - 10)
    age = max(0, spec.current_year - int(year_established))

    links = row.get("connectivity") or {}
    km_port = _connectivity_distance(links, "nearestPort", 60.0)
    km_airport = _connectivity_distance(links, "airport", 30.0)
    km_highway = _connectivity_distance(links, "highway", 5.0)
    connectivity_score = _logistics_score(km_port, km_airport, km_highway)

    province_key = (row.get("province") or "").strip().lower()
    fdi_musd = spec.province_fdi.get(province_key, spec.default_fdi)

    special_zone = bool((row.get("incentives") or {}).get("specialZone"))
    park_industries = row.get("targetIndustries") or []
    park_region = str(row.get("region") or "south").lower()

    features = {
        "occupancy": occ,
        "log_area_ha": math.log1p(area),
        "park_age_years": float(age),
        "log_dist_port_km": math.log1p(km_port),
        "log_dist_airport_km": math.log1p(km_airport),
        "log_dist_highway_km": math.log1p(km_highway),
        "logistics_connectivity_score": connectivity_score,
        "log_fdi_province": math.log1p(fdi_musd),
        "has_special_zone": float(special_zone),
    }
    # One flag per tracked industry (substring match on the park's list).
    features.update(
        {
            f"ind_{name}": float(_industry_match(park_industries, name))
            for name in spec.top_industries
        }
    )
    # Drop-first one-hot: the first region in region_order is the baseline.
    features.update(
        {f"region_{name}": float(park_region == name) for name in spec.region_order[1:]}
    )
    return features
def build_feature_matrix(rows: list[dict], spec: FeatureSpec) -> tuple[np.ndarray, list[str]]:
    """Featureize every row and stack the results into a float64 matrix.

    Returns:
        ``(X, cols)`` where ``cols`` is ``spec.feature_cols`` and row ``i`` of
        ``X`` holds the features of ``rows[i]`` in that column order.
    """
    per_row = list(map(lambda record: featureize(record, spec), rows))
    columns = spec.feature_cols
    table = []
    for feats in per_row:
        table.append([feats[name] for name in columns])
    X = np.array(table, dtype=np.float64)
    return X, columns
# ── Sign-constrained ridge ─────────────────────────────────────
def fit_ridge_nn(X: np.ndarray, y: np.ndarray, alpha: float, sign_vec: np.ndarray) -> np.ndarray:
    """Fit ``y ≈ X @ β`` with ridge penalty ``alpha`` and sign constraints.

    ``sign_vec[j]`` is -1, 0 or +1. For ±1 the coefficient of column ``j`` is
    constrained to that sign; for 0 it is unconstrained. The problem is
    solved as non-negative least squares on the augmented system

        minimize ‖[X̃; sqrt(α)·I] β̃ − [y; 0]‖²   subject to   β̃ ≥ 0

    where X̃ holds each signed column multiplied by its sign (so the expected
    sign becomes positive) and each unsigned column twice, as ``+x`` and
    ``−x``, so that ``β = β⁺ − β⁻`` can take either sign.

    Returns:
        Coefficient vector of length ``X.shape[1]`` in original column order.
    """
    _, n_features = X.shape

    # Build the expanded design matrix and remember, for every expanded
    # column, which original column it came from and with which multiplier.
    expanded: list[np.ndarray] = []
    origin: list[tuple[int, int]] = []
    for j in range(n_features):
        column = X[:, j]
        if sign_vec[j] == 0:
            expanded.extend((column, -column))
            origin.extend(((j, +1), (j, -1)))
            continue
        expanded.append(sign_vec[j] * column)
        origin.append((j, int(sign_vec[j])))
    design = np.stack(expanded, axis=1)

    # Ridge as extra rows: sqrt(alpha) * I against a zero target.
    width = design.shape[1]
    design_aug = np.vstack([design, math.sqrt(alpha) * np.eye(width)])
    target_aug = np.concatenate([y, np.zeros(width)])
    solution, _ = nnls(design_aug, target_aug, maxiter=5 * width)

    # Fold the non-negative solution back onto the original columns.
    beta = np.zeros(n_features)
    for weight, (j, multiplier) in zip(solution, origin):
        if sign_vec[j] == 0:
            beta[j] += multiplier * weight
        else:
            beta[j] = multiplier * weight
    return beta
# ── Model selection + conformal CI ─────────────────────────────
def _pred(X: np.ndarray, beta: np.ndarray, intercept: float) -> np.ndarray:
return X @ beta + intercept
def loo_cv_mape(
    X: np.ndarray,
    y_log: np.ndarray,
    alpha: float,
    sign_vec: np.ndarray,
    scaler: StandardScaler,
) -> tuple[float, np.ndarray]:
    """Run leave-one-out CV for one ``alpha``.

    For each row the scaler is refit on the remaining rows, the
    sign-constrained ridge is fit on the centered log target, and the
    held-out row is predicted.

    Returns:
        ``(mape, residuals_log)``: MAPE on the original (``expm1``) rent
        scale, and the per-row held-out residuals ``y_log - ŷ_log``.
    """
    n_rows = X.shape[0]
    loo_residuals = np.zeros(n_rows)
    loo_rent = np.zeros(n_rows)
    row_ids = np.arange(n_rows)

    for held_out in range(n_rows):
        keep = row_ids != held_out
        # The passed-in scaler is refit on every fold, so the held-out row
        # never influences the standardization.
        X_fit = scaler.fit_transform(X[keep])
        y_fit = y_log[keep]
        # The intercept is the mean log target; the ridge only sees residuals.
        fold_intercept = float(np.mean(y_fit))
        fold_beta = fit_ridge_nn(X_fit, y_fit - fold_intercept, alpha, sign_vec)

        x_held = scaler.transform(X[held_out : held_out + 1])
        log_hat = float(_pred(x_held, fold_beta, fold_intercept)[0])
        loo_residuals[held_out] = y_log[held_out] - log_hat
        loo_rent[held_out] = math.expm1(log_hat)

    true_rent = np.expm1(y_log)
    # The denominator is floored to avoid division by zero.
    relative_error = np.abs(loo_rent - true_rent) / np.maximum(true_rent, 1e-6)
    return float(np.mean(relative_error)), loo_residuals
def conformal_coverage(residuals_log: np.ndarray, q: float) -> float:
    """Return the fraction of residuals whose absolute value is at most ``q``."""
    inside_band = np.abs(residuals_log) <= q
    return float(np.mean(inside_band))
# ── Training pipeline ──────────────────────────────────────────
def train_head(
    rows: list[dict],
    target_key: str,
    spec: FeatureSpec,
) -> dict[str, Any]:
    """Fit a single rent head and return its parameters and diagnostics.

    Rows whose ``target_key`` is ``None`` (or missing) are dropped. The target
    is modelled as ``log1p(rent)``; the ridge strength is the candidate with
    the lowest leave-one-out MAPE, and the model is then refit on all rows.

    Args:
        rows: Park records as dictionaries.
        target_key: Key of the rent column to model.
        spec: Feature specification passed to ``build_feature_matrix``; its
            ``sign_priors`` supply the per-column sign constraints.

    Returns:
        A dict holding the fitted coefficients, intercept, fitted scaler,
        chosen alpha, the 80% quantile of absolute LOO log-residuals, feature
        columns, sign vector, training size, LOO MAPE/coverage and per-region
        slice metrics.

    Raises:
        ValueError: If fewer than 8 rows have a non-null target.
    """
    valid = [row for row in rows if row.get(target_key) is not None]
    if len(valid) < 8:
        raise ValueError(f"Head '{target_key}': only {len(valid)} non-null rows — too few to train.")

    X, cols = build_feature_matrix(valid, spec)
    y_log = np.log1p(np.array([row[target_key] for row in valid], dtype=np.float64))
    # Columns without a prior get 0 (unconstrained sign).
    sign_vec = np.array([spec.sign_priors.get(col, 0) for col in cols], dtype=np.int8)

    # Scaler for the final model; the LOO loop below refits its own per fold.
    scaler_final = StandardScaler()
    scaler_final.fit(X)

    # Keep the first candidate achieving the strictly lowest LOO MAPE.
    best = None
    for candidate in np.logspace(-2, 3, 20):
        mape, residuals = loo_cv_mape(X, y_log, candidate, sign_vec, StandardScaler())
        if best is not None and not (mape < best["mape"]):
            continue
        best = {"alpha": candidate, "mape": mape, "residuals_log": residuals}
    assert best is not None

    # Final fit on every valid row with the selected alpha.
    X_std = scaler_final.transform(X)
    intercept = float(np.mean(y_log))
    beta = fit_ridge_nn(X_std, y_log - intercept, best["alpha"], sign_vec)

    loo_residuals = best["residuals_log"]
    q80 = float(np.quantile(np.abs(loo_residuals), 0.80))
    coverage = conformal_coverage(loo_residuals, q80)

    # Per-region diagnostics: in-sample MAPE of the final model plus the
    # median LOO residual, for each distinct region label.
    regions = np.array([row.get("region", "south") for row in valid])
    preds_rent = np.expm1(X_std @ beta + intercept)
    y_rent = np.expm1(y_log)
    abs_pct_err = np.abs(preds_rent - y_rent) / np.maximum(y_rent, 1e-6)

    slices: dict[str, dict[str, float]] = {}
    for region in np.unique(regions):
        members = np.flatnonzero(regions == region)
        if members.size == 0:
            continue
        slices[region] = {
            "n": int(members.size),
            "mape_in_sample": round(float(np.mean(abs_pct_err[members])), 4),
            "median_residual_log": round(float(np.median(loo_residuals[members])), 4),
        }

    return {
        "coefficients": beta,
        "intercept": intercept,
        "scaler": scaler_final,
        "alpha": float(best["alpha"]),
        "q80_log": q80,
        "feature_cols": cols,
        "sign_vec": sign_vec,
        "n_train": len(valid),
        "mape_loo": round(float(best["mape"]), 4),
        "coverage_80_loo": round(coverage, 4),
        "slices": slices,
    }
def main(argv: list[str] | None = None) -> int:
    """Train the three rent heads and write the pickle artifact and model card.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` once both output files have been written.
    """
    # Defaults are resolved against the parent of this file's directory.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", default=os.path.join(base_dir, "data/industrial/parks.json"))
    cli.add_argument("--out", default=os.path.join(base_dir, "models"))
    args = cli.parse_args(argv)

    with open(args.input, "r", encoding="utf-8") as src:
        rows: list[dict] = json.load(src)

    spec = FeatureSpec()
    # Head name -> target column in the input rows.
    targets_by_head = {
        "land": "landRentUsdM2Year",
        "rbf": "rbfRentUsdM2Month",
        "rbw": "rbwRentUsdM2Month",
    }

    heads: dict[str, dict[str, Any]] = {}
    card_heads: dict[str, dict[str, Any]] = {}
    for head_name, target_key in targets_by_head.items():
        print(f"→ Training head '{head_name}' on target '{target_key}'...")
        trained = train_head(rows, target_key, spec)
        heads[head_name] = trained
        card_heads[head_name] = {
            "target_column": target_key,
            **{key: trained[key] for key in ("n_train", "alpha", "mape_loo", "coverage_80_loo")},
            "q80_log": round(trained["q80_log"], 4),
            "top_coefficients": _top_coefs(trained),
            "slices": trained["slices"],
        }
        print(
            f" α={trained['alpha']:.4g} MAPE_LOO={trained['mape_loo']:.3f}"
            f" coverage_80={trained['coverage_80_loo']:.3f} n={trained['n_train']}"
        )

    os.makedirs(args.out, exist_ok=True)
    pkl_path = os.path.join(args.out, "avm_industrial_park_ridge_v1.pkl")
    card_path = os.path.join(args.out, "avm_industrial_park_ridge_v1.model_card.json")

    # The artifact is built from plain dicts and numpy arrays only (the fitted
    # scaler is reduced to its mean/scale arrays), so loading it does not
    # require importing this training module.
    spec_fields = (
        "feature_cols",
        "region_order",
        "top_industries",
        "province_fdi",
        "default_fdi",
        "sign_priors",
        "current_year",
    )
    feature_spec = {field: getattr(spec, field) for field in spec_fields}

    passthrough = ("alpha", "q80_log", "feature_cols", "n_train", "mape_loo", "coverage_80_loo")
    serialized_heads: dict[str, dict[str, Any]] = {}
    for name, trained in heads.items():
        fitted_scaler = trained["scaler"]
        serialized_heads[name] = {
            "coefficients": np.asarray(trained["coefficients"], dtype=np.float64),
            "intercept": float(trained["intercept"]),
            "scaler_mean": np.asarray(fitted_scaler.mean_, dtype=np.float64),
            "scaler_scale": np.asarray(fitted_scaler.scale_, dtype=np.float64),
            **{key: trained[key] for key in passthrough},
        }

    trained_at = datetime.now(timezone.utc).isoformat()
    artifact = {
        "version": ARTIFACT_VERSION,
        "feature_spec": feature_spec,
        "heads": serialized_heads,
        "trained_at": trained_at,
    }
    with open(pkl_path, "wb") as sink:
        pickle.dump(artifact, sink)

    card = {
        "version": ARTIFACT_VERSION,
        "trained_at": trained_at,
        "n_parks_in_source": len(rows),
        "heads": card_heads,
        "warnings": [
            "n_train < 30 per head — LOO metrics are noisy; interpret CIs as wide.",
            "Targets are log1p-transformed rent; CIs use conformal quantile on log residuals.",
        ],
    }
    with open(card_path, "w", encoding="utf-8") as sink:
        json.dump(card, sink, ensure_ascii=False, indent=2)

    print(f"\n✓ Wrote artifact → {pkl_path}")
    print(f"✓ Wrote model card → {card_path}")
    return 0
def _top_coefs(head: dict[str, Any], k: int = 8) -> list[dict[str, float]]:
beta = head["coefficients"]
cols = head["feature_cols"]
order = np.argsort(-np.abs(beta))[:k]
return [
{"feature": cols[i], "coef": round(float(beta[i]), 4)}
for i in order
if abs(beta[i]) > 1e-6
]
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -1,11 +1,19 @@
"""Tests for industrial AVM rent estimation endpoint."""
from pathlib import Path
import pytest
from fastapi.testclient import TestClient
from app.main import app
from app.models.avm_industrial import IndustrialAVMRequest
# Shared in-process HTTP client for the FastAPI app under test.
client = TestClient(app)
# Parent of the directory that contains this test file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory and file where the ridge tests look for the trained artifact;
# tests decorated with skipif are skipped when the file does not exist.
RIDGE_MODEL_DIR = REPO_ROOT / "models"
RIDGE_ARTIFACT = RIDGE_MODEL_DIR / "avm_industrial_park_ridge_v1.pkl"
# ── Minimal valid request payload ───────────────────────────────
_PREDICT_PAYLOAD = {
@@ -178,3 +186,99 @@ def test_predict_industrial_invalid_occupancy():
json={**_PREDICT_PAYLOAD, "park_occupancy_rate": 1.5},
)
assert resp.status_code == 422
# ── Ridge v1 artifact tests (TEC-2768) ───────────────────────────────
# Baseline request (ready-built factory) shared by the ridge tests below;
# individual tests derive variants from it via model_copy(update=...).
_RIDGE_REQ = IndustrialAVMRequest(
    province="Bình Dương",
    region="south",
    park_occupancy_rate=0.85,
    park_area_ha=500,
    park_age_years=10,
    distance_to_port_km=25,
    distance_to_airport_km=20,
    distance_to_highway_km=2,
    property_type="ready_built_factory",
    area_m2=5000,
    ceiling_height_m=10,
    floor_load_ton_m2=3.0,
    power_capacity_kva=1500,
    building_coverage=0.55,
    loading_docks=4,
    zoning="general_industrial",
    industry_demand_index=0.7,
    fdi_province_musd=4800,
    labor_cost_province_vnd=8_500_000,
    logistics_connectivity_score=0.85,
)
def _fresh_service_with_model_dir(model_dir: Path):
    """Construct a new ``IndustrialAVMService`` with ``settings.model_path`` set to `model_dir`.

    The shared `industrial_avm_service` is a module-level singleton whose
    backend is decided at import time, so tests that need a specific model
    directory build their own instance. ``settings.model_path`` is restored
    to its previous value before returning, even if construction raises.
    """
    from app.config import settings
    from app.services.avm_industrial_service import IndustrialAVMService

    saved_path = settings.model_path
    settings.model_path = str(model_dir)
    try:
        service = IndustrialAVMService()
    finally:
        settings.model_path = saved_path
    return service
@pytest.mark.skipif(not RIDGE_ARTIFACT.exists(), reason="ridge artifact not built")
def test_predict_uses_ridge_when_artifact_present():
    """With the artifact on disk the service selects the ridge backend."""
    service = _fresh_service_with_model_dir(RIDGE_MODEL_DIR)
    assert service._backend == "ridge"
    assert service._model_version == "ridge-industrial-v1"

    result = service.predict(_RIDGE_REQ)
    assert result.model_version == "ridge-industrial-v1"

    estimate = result.estimated_rent_usd_m2
    low = result.rent_range_low_usd_m2
    high = result.rent_range_high_usd_m2
    assert estimate > 0
    # The point estimate must lie inside the reported band ...
    assert low <= estimate
    assert high >= estimate
    # ... and the band itself must have strictly positive width.
    assert high > low
    # Acceptance threshold on the reported confidence.
    assert result.confidence >= 0.75
def test_predict_falls_back_to_heuristic_when_artifact_absent(tmp_path: Path):
    """An empty model directory must leave the service on the heuristic backend."""
    # tmp_path is a fresh, empty directory, so no artifact can be found in it.
    service = _fresh_service_with_model_dir(tmp_path)
    assert service._backend == "heuristic"

    result = service.predict(_RIDGE_REQ)
    assert result.model_version == "heuristic-v1"
    assert result.estimated_rent_usd_m2 > 0
@pytest.mark.skipif(not RIDGE_ARTIFACT.exists(), reason="ridge artifact not built")
def test_ridge_monotonic_occupancy():
    """Raising park occupancy must not lower the estimated rent."""
    service = _fresh_service_with_model_dir(RIDGE_MODEL_DIR)
    sparse_park = _RIDGE_REQ.model_copy(update={"park_occupancy_rate": 0.30})
    busy_park = _RIDGE_REQ.model_copy(update={"park_occupancy_rate": 0.95})

    rent_sparse = service.predict(sparse_park).estimated_rent_usd_m2
    rent_busy = service.predict(busy_park).estimated_rent_usd_m2
    assert rent_busy >= rent_sparse
@pytest.mark.skipif(not RIDGE_ARTIFACT.exists(), reason="ridge artifact not built")
def test_ridge_land_head_conversion():
    """industrial_land requests must convert annual → monthly USD/m²."""
    service = _fresh_service_with_model_dir(RIDGE_MODEL_DIR)
    land_request = _RIDGE_REQ.model_copy(update={"property_type": "industrial_land"})
    result = service.predict(land_request)

    assert result.estimated_rent_usd_m2 > 0
    # The annual figure must equal 12 monthly payments, within a 0.5 tolerance
    # that absorbs rounding of the two response fields.
    drift = result.annual_rent_usd_m2 - result.estimated_rent_usd_m2 * 12
    assert -0.5 < drift < 0.5
@pytest.mark.skipif(not RIDGE_ARTIFACT.exists(), reason="ridge artifact not built")
def test_ridge_warehouse_head_different_from_factory():
    """Warehouse and factory requests must route to different ridge heads."""
    service = _fresh_service_with_model_dir(RIDGE_MODEL_DIR)
    factory_request = _RIDGE_REQ.model_copy(update={"property_type": "ready_built_factory"})
    warehouse_request = _RIDGE_REQ.model_copy(update={"property_type": "warehouse"})

    factory_rent = service.predict(factory_request).estimated_rent_usd_m2
    warehouse_rent = service.predict(warehouse_request).estimated_rent_usd_m2
    # Only inequality is checked: identical inputs yielding different rents
    # shows the two property types are not served by the same head. The
    # direction of the difference is not asserted here.
    assert factory_rent != warehouse_rent

View File

@@ -0,0 +1,504 @@
/**
* Integration test: verifies all MCP servers register correctly in McpRegistryService
* and each tool is callable with valid response schemas.
*
* External HTTP calls (AI service, NestJS API) are mocked via globalThis.fetch.
* Typesense is mocked at the client level.
*/
import type { Client as TypesenseClient } from 'typesense';
import { describe, it, expect, vi, beforeAll, afterAll } from 'vitest';
import { createIndustrialParksServer } from '../industrial-parks/industrial-parks.server';
import { createMarketAnalyticsServer } from '../market-analytics/market-analytics.server';
import { createPropertySearchServer } from '../property-search/property-search.server';
import { createReportsServer } from '../reports/reports.server';
import { createValuationServer } from '../valuation/valuation.server';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** Shape of the value a tool handler resolves to, as consumed by these tests. */
type ToolResult = {
  // parseToolResult expects exactly one item of type 'text' holding JSON.
  content: { type: string; text: string }[];
  // Asserted falsy for every successful tool call below.
  isError?: boolean;
};
// ---------------------------------------------------------------------------
// Mocks — Typesense client
// ---------------------------------------------------------------------------
/**
 * Builds a minimal Typesense client stand-in whose
 * `collections().documents().search()` chain always resolves to the same
 * canned response wrapping `defaultHits`.
 *
 * The `search` mock is also exposed as `_search` so tests can inspect calls.
 */
function createMockTypesenseClient(defaultHits: unknown[] = []) {
  const cannedResponse = {
    hits: defaultHits.map((document) => ({ document })),
    found: defaultHits.length,
    search_time_ms: 2,
  };
  const search = vi.fn().mockResolvedValue(cannedResponse);
  const documents = vi.fn().mockReturnValue({ search });
  const collections = vi.fn().mockReturnValue({ documents });
  return { collections, _search: search };
}
// ---------------------------------------------------------------------------
// Mocks — fetch responses for each backend
// ---------------------------------------------------------------------------
/**
 * Canned backend bodies keyed by a URL path fragment. `mockFetchForUrl`
 * returns the first entry whose key is contained in the requested URL.
 *
 * Bodies use snake_case keys, while the tests below assert camelCase keys on
 * the parsed tool output.
 */
const MOCK_RESPONSES: Record<string, unknown> = {
  // Served to the analyze_industrial_location tool.
  '/industrial/analyze-location': {
    overall_score: 8.2,
    connectivity: {
      nearest_port: { name: 'Cảng Cát Lái', distanceKm: 22 },
      nearest_airport: { name: 'Tân Sơn Nhất', distanceKm: 28 },
      nearest_highway: { name: 'QL1A', distanceKm: 1.5 },
    },
    infrastructure: {
      power_availability: '110kV on-site',
      water_supply: 'Municipal',
      wastewater_treatment: 'Central WWTP',
      telecom: 'Fiber optic',
    },
    labor_market: {
      worker_pool_radius_30km: 450000,
      average_wage_usd: 290,
      nearby_universities: ['ĐH Bình Dương'],
    },
    incentives: ['CIT exemption 4 years'],
    risks: ['Flooding risk'],
  },
  // Served to the estimate_industrial_rent tool.
  '/industrial/estimate-rent': {
    estimated_rent_usd_m2: 4.5,
    pricing_unit: 'USD/m²/month',
    total_monthly_usd: 45000,
    total_lease_usd: 5400000,
    management_fee_usd_m2: 0.6,
    deposit_months: 3,
    market_comparison: {
      province_low: 3.0,
      province_high: 7.0,
      province_avg: 4.8,
    },
    breakdown: [
      { item: 'Base rent', amount: 38000 },
      { item: 'Management fee', amount: 6000 },
    ],
  },
  // Served to the generate_report tool.
  '/reports/generate': {
    report_id: 'rpt-int-001',
    report_type: 'market_overview',
    title: 'Báo cáo thị trường Q7',
    location: 'Quận 7, Hồ Chí Minh',
    generated_at: '2026-04-16T10:00:00Z',
    summary: 'Thị trường ổn định',
    sections: [{ title: 'Tổng quan', content: '...', charts: [] }],
    key_metrics: { avgPriceVND: 4_500_000_000 },
  },
  // Served to the get_macro_data tool.
  '/reports/macro-data': {
    province: 'Bình Dương',
    data: {
      gdp: [{ year: 2024, value: 20.1, unit: 'billion USD', yoy_change: 8.6 }],
    },
    highlights: ['GDP above national average'],
  },
};
/**
 * Resolves a URL to a fake `Response`: a 200 carrying the first
 * `MOCK_RESPONSES` body whose path fragment occurs in `url`, or a 404 with
 * the text 'Not found' when no fragment matches.
 */
function mockFetchForUrl(url: string): Response {
  const matched = Object.entries(MOCK_RESPONSES).find(([path]) => url.includes(path));

  if (matched === undefined) {
    return {
      ok: false,
      status: 404,
      text: async () => 'Not found',
    } as unknown as Response;
  }

  const payload = matched[1];
  return {
    ok: true,
    status: 200,
    json: async () => payload,
    text: async () => JSON.stringify(payload),
  } as unknown as Response;
}
// ---------------------------------------------------------------------------
// Industrial park sample document (for Typesense search results)
// ---------------------------------------------------------------------------
/**
 * Single park document handed to `createMockTypesenseClient` in the suite
 * below, so every mocked search returns exactly this hit.
 */
const SAMPLE_PARK = {
  parkId: 'park-int-001',
  name: 'KCN VSIP II-A',
  nameEn: 'VSIP II-A Industrial Park',
  developer: 'VSIP Group',
  province: 'Bình Dương',
  region: 'south',
  status: 'operational',
  totalAreaHa: 345,
  remainingAreaHa: 62,
  occupancyRate: 82,
  landRentUsdM2Year: 90,
  rbfRentUsdM2Month: 4.8,
  rbwRentUsdM2Month: 3.5,
  targetIndustries: ['electronics', 'automotive'],
  tenantCount: 85,
};
// ---------------------------------------------------------------------------
// Helper: extract tool handler from McpServer internal state
// ---------------------------------------------------------------------------
/**
 * Looks up a tool's handler by name in the server's private
 * `_registeredTools` map.
 *
 * @throws Error listing the registered tool names when `name` is unknown.
 */
function getToolHandler(
  server: unknown,
  name: string,
): (params: unknown) => Promise<ToolResult> {
  type Handler = (p: unknown) => Promise<ToolResult>;
  const { _registeredTools: registry } = server as {
    _registeredTools: Record<string, { handler: Handler }>;
  };

  const registered = registry[name];
  if (registered) {
    return registered.handler;
  }

  const available = Object.keys(registry).join(', ');
  throw new Error(`Tool "${name}" not registered. Available: ${available}`);
}
/**
 * Asserts the result carries exactly one `text` content item and returns
 * that item's text parsed as a JSON object.
 */
function parseToolResult(result: ToolResult): Record<string, unknown> {
  const { content } = result;
  expect(content).toHaveLength(1);

  const [only] = content;
  expect(only.type).toBe('text');
  return JSON.parse(only.text) as Record<string, unknown>;
}
// ---------------------------------------------------------------------------
// Integration tests
// ---------------------------------------------------------------------------
describe('MCP Integration: all servers and tools end-to-end', () => {
// Fixtures shared by every nested suite: a Typesense mock returning the
// sample park, the two servers built in beforeAll, and a spy on global fetch.
const typesenseClient = createMockTypesenseClient([SAMPLE_PARK]);
let industrialServer: ReturnType<typeof createIndustrialParksServer>;
let reportsServer: ReturnType<typeof createReportsServer>;
const fetchSpy = vi.spyOn(globalThis, 'fetch');

beforeAll(() => {
  // Route every fetch through the canned responses, whichever of the three
  // accepted input forms the caller used.
  fetchSpy.mockImplementation(async (input: string | URL | Request) => {
    let url: string;
    if (typeof input === 'string') {
      url = input;
    } else if (input instanceof URL) {
      url = input.toString();
    } else {
      url = input.url;
    }
    return mockFetchForUrl(url);
  });

  industrialServer = createIndustrialParksServer({
    typesenseClient: typesenseClient as unknown as TypesenseClient,
    collectionName: 'industrial_parks',
    aiServiceBaseUrl: 'http://ai-service:8000',
  });
  reportsServer = createReportsServer({
    apiBaseUrl: 'http://api:3001/api/v1',
  });
});

afterAll(() => {
  // Put the real fetch back once the suite is done.
  fetchSpy.mockRestore();
});
// -----------------------------------------------------------------------
// 1. Server factory tests — all 5 factories produce valid McpServer instances
// -----------------------------------------------------------------------
describe('server factories', () => {
  it('creates all 5 server instances without errors', () => {
    const mockTypesense = typesenseClient as unknown as TypesenseClient;

    // The first two servers were already built in beforeAll.
    expect(industrialServer).toBeDefined();
    expect(reportsServer).toBeDefined();

    // The remaining three factories are exercised here.
    const remaining = [
      createPropertySearchServer({
        typesenseClient: mockTypesense,
        collectionName: 'listings',
      }),
      createMarketAnalyticsServer({
        typesenseClient: mockTypesense,
        collectionName: 'listings',
      }),
      createValuationServer({
        aiServiceBaseUrl: 'http://ai-service:8000',
      }),
    ];
    for (const server of remaining) {
      expect(server).toBeDefined();
    }
  });
});
// -----------------------------------------------------------------------
// 2. Industrial parks server — 3 tools
// -----------------------------------------------------------------------
describe('industrial-parks server', () => {
  /** Asserts that `target` exposes every listed key. */
  const expectKeys = (target: unknown, keys: string[]): void => {
    for (const key of keys) {
      expect(target).toHaveProperty(key);
    }
  };

  it('search_industrial_parks: returns structured results from Typesense', async () => {
    const search = getToolHandler(industrialServer, 'search_industrial_parks');
    const result = await search({
      query: 'VSIP Bình Dương',
      page: 1,
      perPage: 20,
    });
    expect(result.isError).toBeFalsy();
    const data = parseToolResult(result);

    // Envelope schema
    expectKeys(data, ['totalFound', 'page', 'perPage', 'searchTimeMs', 'results']);
    expect(typeof data.totalFound).toBe('number');

    const results = data.results as Record<string, unknown>[];
    expect(results.length).toBeGreaterThan(0);

    // Per-item schema, checked on the first hit
    const [firstHit] = results;
    expectKeys(firstHit, [
      'parkId',
      'name',
      'developer',
      'province',
      'region',
      'status',
      'totalAreaHa',
      'remainingAreaHa',
      'occupancyRate',
      'landRentUsdM2Year',
      'targetIndustries',
      'tenantCount',
    ]);
  });

  it('analyze_industrial_location: calls AI service and returns analysis schema', async () => {
    const analyze = getToolHandler(industrialServer, 'analyze_industrial_location');
    const result = await analyze({
      latitude: 11.05,
      longitude: 106.65,
      targetIndustry: 'electronics',
    });
    expect(result.isError).toBeFalsy();
    const data = parseToolResult(result);

    // Top-level schema
    expectKeys(data, [
      'overallScore',
      'connectivity',
      'infrastructure',
      'laborMarket',
      'incentives',
      'risks',
    ]);
    expect(typeof data.overallScore).toBe('number');

    const connectivity = data.connectivity as Record<string, unknown>;
    expectKeys(connectivity, ['nearestPort', 'nearestAirport']);

    // The request must have gone to the AI service as a POST.
    expect(fetchSpy).toHaveBeenCalledWith(
      'http://ai-service:8000/industrial/analyze-location',
      expect.objectContaining({ method: 'POST' }),
    );
  });

  it('estimate_industrial_rent: calls AI service and returns rent estimate schema', async () => {
    const estimate = getToolHandler(industrialServer, 'estimate_industrial_rent');
    const result = await estimate({
      province: 'Bình Dương',
      propertyType: 'ready_built_factory',
      areaM2: 10000,
      leaseDurationYears: 10,
    });
    expect(result.isError).toBeFalsy();
    const data = parseToolResult(result);

    // Top-level schema
    expectKeys(data, [
      'estimatedRentUsdM2',
      'pricingUnit',
      'totalMonthlyUsd',
      'totalLeaseUsd',
      'managementFeeUsdM2',
      'depositMonths',
      'marketComparison',
      'breakdown',
      'input',
    ]);
    expect(typeof data.estimatedRentUsdM2).toBe('number');

    const comparison = data.marketComparison as Record<string, unknown>;
    expectKeys(comparison, ['provinceLow', 'provinceHigh', 'provinceAvg']);

    // The request must have gone to the AI service as a POST.
    expect(fetchSpy).toHaveBeenCalledWith(
      'http://ai-service:8000/industrial/estimate-rent',
      expect.objectContaining({ method: 'POST' }),
    );
  });
});
// -----------------------------------------------------------------------
// 3. Reports server — 2 tools
// -----------------------------------------------------------------------
describe('reports server', () => {
  /** Asserts that `target` exposes every listed key. */
  const expectKeys = (target: unknown, keys: string[]): void => {
    for (const key of keys) {
      expect(target).toHaveProperty(key);
    }
  };

  it('generate_report: calls NestJS API and returns report schema', async () => {
    const handler = getToolHandler(reportsServer, 'generate_report');
    const result = await handler({
      reportType: 'market_overview',
      location: 'Quận 7, Hồ Chí Minh',
      period: '1y',
      includeForecasts: false,
      includeMacro: false,
      language: 'vi',
    });
    expect(result.isError).toBeFalsy();
    const data = parseToolResult(result);

    // Schema validation
    expectKeys(data, [
      'reportId',
      'reportType',
      'title',
      'location',
      'generatedAt',
      'summary',
      'sections',
      'keyMetrics',
    ]);
    expect(typeof data.reportId).toBe('string');
    expect(Array.isArray(data.sections)).toBe(true);

    // Verify correct URL was called (NestJS API, not AI service)
    expect(fetchSpy).toHaveBeenCalledWith(
      'http://api:3001/api/v1/reports/generate',
      expect.objectContaining({ method: 'POST' }),
    );
  });

  it('get_macro_data: calls NestJS API with GET and returns macro data schema', async () => {
    const handler = getToolHandler(reportsServer, 'get_macro_data');
    const result = await handler({
      province: 'Bình Dương',
      categories: ['gdp'],
      fromYear: 2024,
      toYear: 2024,
    });
    expect(result.isError).toBeFalsy();
    const data = parseToolResult(result);

    // Schema validation
    expectKeys(data, ['province', 'period', 'data', 'highlights']);
    expect(data.province).toBe('Bình Dương');

    const period = data.period as Record<string, number>;
    expect(period.from).toBe(2024);
    expect(period.to).toBe(2024);

    const macroData = data.data as Record<string, unknown[]>;
    expect(macroData).toHaveProperty('gdp');
    expect(macroData.gdp).toHaveLength(1);
    const gdpPoint = macroData.gdp[0] as Record<string, unknown>;
    expectKeys(gdpPoint, ['year', 'value', 'unit', 'yoyChange']);

    // Verify it used GET (not POST). The recorded fetch input may be a
    // string, a URL or a Request, so normalise it before matching instead of
    // assuming a string (a URL object has no `includes`).
    const macroCall = fetchSpy.mock.calls.find((call) => {
      const target = call[0];
      const url =
        typeof target === 'string'
          ? target
          : target instanceof URL
            ? target.toString()
            : target.url;
      return url.includes('/reports/macro-data');
    });
    expect(macroCall).toBeDefined();
    expect((macroCall![1] as RequestInit).method).toBe('GET');
  });
});
// -----------------------------------------------------------------------
// 4. Env var routing: industrial tools → AI_SERVICE_URL, reports → API_BASE_URL
// -----------------------------------------------------------------------
describe('env var routing', () => {
  /**
   * Returns the first URL fetched so far that contains `pathFragment`.
   * Recorded inputs may be a string, a URL or a Request, so each one is
   * normalised to its URL string before matching.
   */
  const findCalledUrl = (pathFragment: string): string | undefined =>
    fetchSpy.mock.calls
      .map(([target]) =>
        typeof target === 'string'
          ? target
          : target instanceof URL
            ? target.toString()
            : target.url,
      )
      .find((url) => url.includes(pathFragment));

  it('industrial tools call aiServiceBaseUrl (AI_SERVICE_URL)', async () => {
    // Invoke the tools here rather than relying on fetch history left by
    // earlier tests, so this test also passes when run filtered or alone.
    fetchSpy.mockClear();
    await getToolHandler(industrialServer, 'analyze_industrial_location')({
      latitude: 11.05,
      longitude: 106.65,
      targetIndustry: 'electronics',
    });
    await getToolHandler(industrialServer, 'estimate_industrial_rent')({
      province: 'Bình Dương',
      propertyType: 'ready_built_factory',
      areaM2: 10000,
      leaseDurationYears: 10,
    });

    const analyzeUrl = findCalledUrl('/industrial/analyze-location');
    expect(analyzeUrl).toBeDefined();
    expect(analyzeUrl!.startsWith('http://ai-service:8000')).toBe(true);

    const rentUrl = findCalledUrl('/industrial/estimate-rent');
    expect(rentUrl).toBeDefined();
    expect(rentUrl!.startsWith('http://ai-service:8000')).toBe(true);
  });

  it('report tools call apiBaseUrl (API_BASE_URL)', async () => {
    // Same as above: make the calls under test inside this test.
    fetchSpy.mockClear();
    await getToolHandler(reportsServer, 'generate_report')({
      reportType: 'market_overview',
      location: 'Quận 7, Hồ Chí Minh',
      period: '1y',
      includeForecasts: false,
      includeMacro: false,
      language: 'vi',
    });
    await getToolHandler(reportsServer, 'get_macro_data')({
      province: 'Bình Dương',
      categories: ['gdp'],
      fromYear: 2024,
      toYear: 2024,
    });

    const reportUrl = findCalledUrl('/reports/generate');
    expect(reportUrl).toBeDefined();
    expect(reportUrl!.startsWith('http://api:3001')).toBe(true);

    const macroUrl = findCalledUrl('/reports/macro-data');
    expect(macroUrl).toBeDefined();
    expect(macroUrl!.startsWith('http://api:3001')).toBe(true);
  });
});
// -----------------------------------------------------------------------
// 5. Registry simulation — verify all servers can be registered
// -----------------------------------------------------------------------
describe('registry integration', () => {
  it('McpRegistryService registers industrial-parks and reports servers', async () => {
    const mockTypesense = typesenseClient as unknown as TypesenseClient;

    // Simulate what McpRegistryService.onModuleInit does: one entry per server.
    const servers = new Map<string, unknown>([
      [
        'property-search',
        createPropertySearchServer({
          typesenseClient: mockTypesense,
          collectionName: 'listings',
        }),
      ],
      [
        'market-analytics',
        createMarketAnalyticsServer({
          typesenseClient: mockTypesense,
          collectionName: 'listings',
        }),
      ],
      ['valuation', createValuationServer({ aiServiceBaseUrl: 'http://ai-service:8000' })],
      [
        'industrial-parks',
        createIndustrialParksServer({
          typesenseClient: mockTypesense,
          collectionName: 'industrial_parks',
          aiServiceBaseUrl: 'http://ai-service:8000',
        }),
      ],
      ['reports', createReportsServer({ apiBaseUrl: 'http://api:3001/api/v1' })],
    ]);

    // All 5 servers should be registered, under exactly these names.
    expect(servers.size).toBe(5);
    const registeredNames = Array.from(servers.keys()).sort();
    expect(registeredNames).toEqual([
      'industrial-parks',
      'market-analytics',
      'property-search',
      'reports',
      'valuation',
    ]);

    // Every factory must have produced a value.
    servers.forEach((server, name) => {
      expect(server, `Server "${name}" should be defined`).toBeDefined();
    });
  });
});
});