feat(ai-services): add Python FastAPI AI/ML services container
Create libs/ai-services/ with FastAPI app providing: - POST /avm/predict — XGBoost-backed property price prediction (heuristic fallback) - POST /avm/extract-features — Vietnamese NLP feature extraction from listing text - POST /moderation/check — content moderation with rule-based flagging - GET /health — health check endpoint Includes Dockerfile (Python 3.12), docker-compose integration, Pydantic models, and 9 passing tests covering all endpoints. Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -81,6 +81,76 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- goodgo-net
|
- goodgo-net
|
||||||
|
|
||||||
|
# AI/ML FastAPI service, built from the libs/ai-services Dockerfile.
ai-services:
  build:
    context: ./libs/ai-services
    dockerfile: Dockerfile
  container_name: goodgo-ai-services
  restart: unless-stopped
  ports:
    - '${AI_SERVICES_PORT:-8000}:8000'
  environment:
    # Forwarded to the container; the defaults apply when the host leaves them unset.
    AI_DEBUG: '${AI_DEBUG:-false}'
    AI_LOG_LEVEL: '${AI_LOG_LEVEL:-info}'
  healthcheck:
    # Probes the app's own /health endpoint from inside the container.
    test:
      - CMD
      - python
      - '-c'
      - 'import httpx; httpx.get("http://localhost:8000/health").raise_for_status()'
    interval: 30s
    timeout: 5s
    retries: 5
    start_period: 30s
  networks:
    - goodgo-net
|
||||||
|
|
||||||
|
# Prometheus server; the scrape configuration is bind-mounted read-only.
prometheus:
  image: prom/prometheus:v2.51.0
  container_name: goodgo-prometheus
  restart: unless-stopped
  ports:
    - '${PROMETHEUS_PORT:-9090}:9090'
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.retention.time=15d'
    # NOTE(review): turns on the HTTP lifecycle endpoints while port 9090 is
    # published to the host — confirm that exposure is intended.
    - '--web.enable-lifecycle'
  volumes:
    - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - prometheus_data:/prometheus
  extra_hosts:
    - 'host.docker.internal:host-gateway'
  healthcheck:
    test:
      - CMD
      - wget
      - '--spider'
      - '-q'
      - 'http://localhost:9090/-/healthy'
    interval: 15s
    timeout: 5s
    retries: 3
    start_period: 10s
  networks:
    - goodgo-net
|
||||||
|
|
||||||
|
# Grafana UI; starts only after the prometheus service reports healthy.
grafana:
  image: grafana/grafana:10.4.1
  container_name: goodgo-grafana
  restart: unless-stopped
  ports:
    - '${GRAFANA_PORT:-3002}:3000'
  environment:
    # NOTE(review): both credentials fall back to "admin" when unset —
    # override GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD outside local dev.
    GF_SECURITY_ADMIN_USER: '${GRAFANA_ADMIN_USER:-admin}'
    GF_SECURITY_ADMIN_PASSWORD: '${GRAFANA_ADMIN_PASSWORD:-admin}'
    GF_USERS_ALLOW_SIGN_UP: 'false'
  volumes:
    - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
    - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    - grafana_data:/var/lib/grafana
  depends_on:
    prometheus:
      condition: service_healthy
  healthcheck:
    test:
      - CMD
      - wget
      - '--spider'
      - '-q'
      - 'http://localhost:3000/api/health'
    interval: 15s
    timeout: 5s
    retries: 3
    start_period: 15s
  networks:
    - goodgo-net
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
pgdata:
|
pgdata:
|
||||||
driver: local
|
driver: local
|
||||||
@@ -90,6 +160,10 @@ volumes:
|
|||||||
driver: local
|
driver: local
|
||||||
minio_data:
|
minio_data:
|
||||||
driver: local
|
driver: local
|
||||||
|
prometheus_data:
|
||||||
|
driver: local
|
||||||
|
grafana_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
goodgo-net:
|
goodgo-net:
|
||||||
|
|||||||
5
libs/ai-services/.gitignore
vendored
Normal file
5
libs/ai-services/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.egg-info/
|
||||||
|
.pytest_cache/
|
||||||
|
dist/
|
||||||
31
libs/ai-services/Dockerfile
Normal file
31
libs/ai-services/Dockerfile
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
# Runtime image for the Goodgo AI/ML FastAPI service.
FROM python:3.12-slim

# Unbuffered stdout/stderr so uvicorn logs show up in `docker logs` immediately;
# skip writing .pyc files into the image layers.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Install system deps for underthesea / numpy
RUN apt-get update && \
    apt-get install -y --no-install-recommends gcc g++ && \
    rm -rf /var/lib/apt/lists/*

# Dependencies are installed before the sources are copied so that editing
# application code does not invalidate this layer.
COPY pyproject.toml .
# stderr is deliberately NOT discarded: if installing from pyproject.toml fails,
# the reason must be visible in the build log before the pinned fallback runs.
RUN pip install --no-cache-dir . || pip install --no-cache-dir \
    "fastapi>=0.115.0" \
    "uvicorn[standard]>=0.32.0" \
    "xgboost>=2.1.0" \
    "numpy>=1.26.0" \
    "underthesea>=6.8.0" \
    "pydantic>=2.9.0" \
    "pydantic-settings>=2.5.0" \
    "httpx>=0.27.0"

COPY app/ ./app/

# Warm up underthesea at build time. Best effort: a failure does not break the
# build, but it is reported instead of being silently swallowed.
RUN python -c "from underthesea import word_tokenize; word_tokenize('test')" \
    || echo "WARNING: underthesea warm-up failed during build" >&2

EXPOSE 8000

HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
    CMD python -c "import httpx; httpx.get('http://localhost:8000/health').raise_for_status()"

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
0
libs/ai-services/app/__init__.py
Normal file
0
libs/ai-services/app/__init__.py
Normal file
13
libs/ai-services/app/config.py
Normal file
13
libs/ai-services/app/config.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Runtime configuration for the AI services application.

    Each field can be overridden through an environment variable carrying the
    ``AI_`` prefix, e.g. ``AI_DEBUG`` or ``AI_LOG_LEVEL``.
    """

    # Used as the FastAPI title and echoed by the /health endpoint.
    app_name: str = "Goodgo AI Services"
    debug: bool = False
    # Directory in which the AVM service looks for ``avm_model.json``.
    model_path: str = "/app/models"
    log_level: str = "info"

    model_config = {
        "env_prefix": "AI_",
        # ``model_path`` starts with pydantic's reserved ``model_`` prefix.
        # Narrowing the protected namespaces keeps the field name (and its
        # AI_MODEL_PATH variable) without a namespace-conflict warning on import.
        "protected_namespaces": ("settings_",),
    }
|
||||||
|
|
||||||
|
|
||||||
|
settings = Settings()
|
||||||
28
libs/ai-services/app/main.py
Normal file
28
libs/ai-services/app/main.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
from app.routers import avm, moderation
|
||||||
|
|
||||||
|
# Application instance; interactive API docs are served at /docs and /redoc.
app = FastAPI(
    title=settings.app_name,
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS policy. Origins are wildcarded, so credentialed requests are disabled:
# pairing "*" with allow_credentials=True would let any website make
# cookie/authorization-bearing calls to this API. Replace "*" with an explicit
# origin list before turning credentials back on.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Feature routers: /avm/* and /moderation/*.
app.include_router(avm.router)
app.include_router(moderation.router)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health() -> dict:
    """Health probe: return a static ``ok`` status plus the configured service name."""
    payload = {"status": "ok"}
    payload["service"] = settings.app_name
    return payload
|
||||||
0
libs/ai-services/app/models/__init__.py
Normal file
0
libs/ai-services/app/models/__init__.py
Normal file
48
libs/ai-services/app/models/avm.py
Normal file
48
libs/ai-services/app/models/avm.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class AVMPredictRequest(BaseModel):
    """Input payload for ``POST /avm/predict``.

    ``area``, ``district``, ``city`` and ``property_type`` are required; the
    remaining attributes are optional and default to "not provided" values.
    """

    # --- required ---
    area: float = Field(gt=0, description="Property area in m²")
    district: str = Field(min_length=1, description="District name")
    city: str = Field(min_length=1, description="City name")
    property_type: str = Field(description="e.g. apartment, house, land")

    # --- optional counts (non-negative) ---
    bedrooms: int = Field(default=0, ge=0)
    bathrooms: int = Field(default=0, ge=0)
    floors: int = Field(default=0, ge=0)

    # --- optional measurements (non-negative) ---
    frontage: float = Field(default=0.0, ge=0, description="Frontage width in meters")
    road_width: float = Field(default=0.0, ge=0, description="Adjacent road width in meters")

    # --- other attributes ---
    year_built: int | None = Field(default=None, description="Year the property was built")
    has_legal_paper: bool = Field(default=True, description="Whether property has sổ đỏ/sổ hồng")
|
||||||
|
|
||||||
|
|
||||||
|
class AVMPredictResponse(BaseModel):
    """Result of a price prediction: point estimate, confidence and a price band."""

    estimated_price_vnd: float = Field(description="Estimated price in VND")
    # Constrained to the closed interval [0, 1].
    confidence: float = Field(ge=0, le=1, description="Prediction confidence score")
    price_per_m2: float = Field(description="Price per m² in VND")
    price_range_low: float = Field(description="Lower bound estimate in VND")
    price_range_high: float = Field(description="Upper bound estimate in VND")
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureExtractRequest(BaseModel):
    """Input payload for ``POST /avm/extract-features``: one non-empty listing text."""

    text: str = Field(min_length=1, description="Vietnamese property listing text")
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractedFeatures(BaseModel):
    """Features pulled out of a listing text.

    Every attribute defaults to ``None``; the extraction service fills in the
    ones it can recognise.
    """

    # Location and category
    area: float | None = None
    district: str | None = None
    city: str | None = None
    property_type: str | None = None

    # Room and floor counts
    bedrooms: int | None = None
    bathrooms: int | None = None
    floors: int | None = None

    # Widths
    frontage: float | None = None
    road_width: float | None = None

    # Price, legal status and raw address
    price_mentioned: float | None = None
    has_legal_paper: bool | None = None
    address_raw: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureExtractResponse(BaseModel):
    """Output of feature extraction: structured features plus raw NLP artefacts."""

    features: ExtractedFeatures
    # Both lists default to empty when nothing is supplied.
    tokens: list[str] = Field(
        default_factory=list,
        description="Tokenized words",
    )
    entities: list[dict] = Field(
        default_factory=list,
        description="Named entities found",
    )
|
||||||
20
libs/ai-services/app/models/moderation.py
Normal file
20
libs/ai-services/app/models/moderation.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class ModerationRequest(BaseModel):
    """Input payload for ``POST /moderation/check``."""

    text: str = Field(min_length=1, description="Text content to moderate")
    # Free-form string; defaults to "listing" when omitted.
    context: str = Field(default="listing", description="Context: listing, comment, profile")
|
||||||
|
|
||||||
|
|
||||||
|
class ModerationFlag(BaseModel):
    """One rule hit: which category fired, how severe it is, and what text matched."""

    category: str
    severity: str = Field(description="low, medium, high")
    # The exact substring of the input that triggered the rule.
    matched_text: str
    reason: str
|
||||||
|
|
||||||
|
|
||||||
|
class ModerationResponse(BaseModel):
    """Moderation verdict: overall flag, risk score, individual hits and redacted text."""

    is_flagged: bool
    # Constrained to the closed interval [0, 1].
    score: float = Field(ge=0, le=1, description="Overall risk score")
    flags: list[ModerationFlag] = Field(default_factory=list)
    cleaned_text: str | None = Field(
        default=None,
        description="Text with flagged content redacted",
    )
|
||||||
0
libs/ai-services/app/routers/__init__.py
Normal file
0
libs/ai-services/app/routers/__init__.py
Normal file
23
libs/ai-services/app/routers/avm.py
Normal file
23
libs/ai-services/app/routers/avm.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from app.models.avm import (
|
||||||
|
AVMPredictRequest,
|
||||||
|
AVMPredictResponse,
|
||||||
|
FeatureExtractRequest,
|
||||||
|
FeatureExtractResponse,
|
||||||
|
)
|
||||||
|
from app.services.avm_service import avm_service, feature_extract_service
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/avm", tags=["AVM"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/predict", response_model=AVMPredictResponse)
def predict(req: AVMPredictRequest) -> AVMPredictResponse:
    """Estimate a property's price by delegating to the shared AVM service."""
    estimate = avm_service.predict(req)
    return estimate
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/extract-features", response_model=FeatureExtractResponse)
def extract_features(req: FeatureExtractRequest) -> FeatureExtractResponse:
    """Pull real-estate features out of Vietnamese listing text via the shared extractor."""
    extraction = feature_extract_service.extract(req)
    return extraction
|
||||||
12
libs/ai-services/app/routers/moderation.py
Normal file
12
libs/ai-services/app/routers/moderation.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from app.models.moderation import ModerationRequest, ModerationResponse
|
||||||
|
from app.services.moderation_service import moderation_service
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/moderation", tags=["Moderation"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/check", response_model=ModerationResponse)
def check(req: ModerationRequest) -> ModerationResponse:
    """Run the submitted text through the shared moderation service."""
    verdict = moderation_service.check(req)
    return verdict
|
||||||
0
libs/ai-services/app/services/__init__.py
Normal file
0
libs/ai-services/app/services/__init__.py
Normal file
229
libs/ai-services/app/services/avm_service.py
Normal file
229
libs/ai-services/app/services/avm_service.py
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from app.models.avm import (
|
||||||
|
AVMPredictRequest,
|
||||||
|
AVMPredictResponse,
|
||||||
|
ExtractedFeatures,
|
||||||
|
FeatureExtractRequest,
|
||||||
|
FeatureExtractResponse,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Integer code assigned to each supported property type when building model features.
PROPERTY_TYPE_MAP = {
    name: code
    for code, name in enumerate(
        ("apartment", "house", "townhouse", "villa", "land", "shophouse")
    )
}

# Baseline price per m² for known cities, in millions of VND (keys are lowercase).
CITY_BASELINE = dict(
    [
        ("hà nội", 85.0),
        ("hồ chí minh", 90.0),
        ("đà nẵng", 45.0),
        ("hải phòng", 35.0),
        ("cần thơ", 25.0),
    ]
)

# Baseline (millions VND/m²) applied to any city missing from CITY_BASELINE.
DEFAULT_BASELINE = 30.0
|
||||||
|
|
||||||
|
|
||||||
|
class AVMService:
    """Automated Valuation Model service.

    Scores requests with an XGBoost booster when one could be loaded at
    construction time; otherwise falls back to a heuristic estimate built from
    per-city baselines, intended for development/demo use.
    """

    # Year substituted into the model features when the request has no usable year_built.
    _DEFAULT_YEAR_BUILT = 2020

    # Heuristic price multiplier per property type; unknown types use 1.0.
    _TYPE_MULTIPLIER = {
        "apartment": 0.9,
        "house": 1.0,
        "townhouse": 1.1,
        "villa": 1.4,
        "land": 0.7,
        "shophouse": 1.3,
    }

    def __init__(self) -> None:
        self._model = None
        self._load_model()

    def _load_model(self) -> None:
        """Try to load ``avm_model.json`` from the configured model directory.

        Never raises. ``self._model`` stays ``None`` (heuristic mode) when the
        file is absent or loading fails. An absent file is logged at info
        level; any other failure (xgboost not importable, unreadable or corrupt
        model) is logged with its traceback so it is not mistaken for
        "no model present".
        """
        self._model = None
        try:
            import os

            from app.config import settings

            model_file = f"{settings.model_path}/avm_model.json"
            if not os.path.isfile(model_file):
                logger.info("No trained AVM model found — using heuristic fallback")
                return

            import xgboost as xgb

            booster = xgb.Booster()
            booster.load_model(model_file)
        except Exception:
            logger.exception("Could not load the AVM model — using heuristic fallback")
            return

        self._model = booster
        logger.info("Loaded XGBoost AVM model from %s", model_file)

    def predict(self, req: AVMPredictRequest) -> AVMPredictResponse:
        """Return a price estimate, using the booster if loaded, else the heuristic."""
        if self._model is not None:
            return self._predict_xgboost(req)
        return self._predict_heuristic(req)

    def _predict_xgboost(self, req: AVMPredictRequest) -> AVMPredictResponse:
        """Score the request with the loaded booster.

        The booster output is exponentiated, i.e. it is treated as a log-price.
        The reported confidence (0.82) and the ±15 % range are fixed constants.
        """
        import xgboost as xgb

        # Single-row feature matrix.
        # NOTE(review): column order presumably has to match the order used at
        # training time — confirm against the training pipeline.
        features = np.array(
            [[
                req.area,
                # Unknown property types are encoded as 1 ("house").
                PROPERTY_TYPE_MAP.get(req.property_type.lower().strip(), 1),
                req.bedrooms,
                req.bathrooms,
                req.floors,
                req.frontage,
                req.road_width,
                req.year_built or self._DEFAULT_YEAR_BUILT,
                1.0 if req.has_legal_paper else 0.0,
            ]]
        )
        dmatrix = xgb.DMatrix(features)
        pred_log = self._model.predict(dmatrix)[0]
        estimated = float(np.exp(pred_log))

        price_per_m2 = estimated / req.area
        return AVMPredictResponse(
            estimated_price_vnd=estimated,
            confidence=0.82,
            price_per_m2=price_per_m2,
            price_range_low=estimated * 0.85,
            price_range_high=estimated * 1.15,
        )

    def _predict_heuristic(self, req: AVMPredictRequest) -> AVMPredictResponse:
        """Estimate the price from a city baseline and a few multiplicative adjustments.

        All monetary outputs are rounded to the nearest 1,000 VND. The reported
        confidence (0.65) and the ±25 % range are fixed constants.
        """
        city_key = req.city.lower().strip()
        base = CITY_BASELINE.get(city_key, DEFAULT_BASELINE)

        # Normalised the same way as the city so " Villa " and "villa" agree.
        type_key = req.property_type.lower().strip()
        type_mult = self._TYPE_MULTIPLIER.get(type_key, 1.0)

        # Adjustments: +2 % per bedroom, +15 % per 10 m of frontage,
        # -30 % when legal papers are missing.
        bedroom_adj = 1.0 + req.bedrooms * 0.02
        frontage_adj = (1.0 + (req.frontage / 10.0) * 0.15) if req.frontage > 0 else 1.0
        legal_adj = 1.0 if req.has_legal_paper else 0.7

        # Baselines are in millions of VND per m², hence the final scale factor.
        price_per_m2 = base * type_mult * bedroom_adj * frontage_adj * legal_adj * 1_000_000
        estimated = price_per_m2 * req.area

        return AVMPredictResponse(
            estimated_price_vnd=round(estimated, -3),
            confidence=0.65,
            price_per_m2=round(price_per_m2, -3),
            price_range_low=round(estimated * 0.75, -3),
            price_range_high=round(estimated * 1.25, -3),
        )
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureExtractService:
    """Extract real-estate features from Vietnamese listing text.

    Numeric and categorical features come from regular expressions and keyword
    lookups; tokens and named entities come from ``underthesea`` when it is
    usable, with a whitespace-split fallback for the tokens otherwise.
    """

    _AREA_PATTERN = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:m2|m²|mét vuông)", re.IGNORECASE)
    _BEDROOM_PATTERN = re.compile(r"(\d+)\s*(?:phòng ngủ|pn|PN)", re.IGNORECASE)
    _BATHROOM_PATTERN = re.compile(r"(\d+)\s*(?:phòng tắm|wc|WC|toilet)", re.IGNORECASE)
    _FLOOR_PATTERN = re.compile(r"(\d+)\s*(?:tầng|lầu)", re.IGNORECASE)
    _FRONTAGE_PATTERN = re.compile(r"(?:mặt tiền|ngang)\s*(\d+(?:[.,]\d+)?)\s*m", re.IGNORECASE)
    _ROAD_WIDTH_PATTERN = re.compile(r"(?:đường|hẻm)\s*(\d+(?:[.,]\d+)?)\s*m", re.IGNORECASE)
    # Group 1 is the amount, group 2 the unit (billion: tỷ/tỉ, million: triệu).
    _PRICE_PATTERN = re.compile(
        r"(\d+(?:[.,]\d+)?)\s*(tỷ|tỉ|triệu)", re.IGNORECASE
    )
    _BILLION_UNITS = ("tỷ", "tỉ")
    _LEGAL_KEYWORDS = ["sổ đỏ", "sổ hồng", "chính chủ", "pháp lý rõ ràng"]

    # Checked in order, first hit wins. Specific phrases come before the
    # generic "đất" so that e.g. a shophouse listing that also mentions its
    # land plot is not classified as land.
    _PROPERTY_TYPES = {
        "căn hộ": "apartment",
        "chung cư": "apartment",
        "nhà phố": "townhouse",
        "nhà riêng": "house",
        "biệt thự": "villa",
        "shophouse": "shophouse",
        "đất nền": "land",
        "đất": "land",
    }

    @staticmethod
    def _to_float(raw: str) -> float:
        """Parse a number that may use a comma as its decimal separator."""
        return float(raw.replace(",", "."))

    @classmethod
    def _find_float(cls, pattern: "re.Pattern[str]", text: str) -> float | None:
        """Return group 1 of the first match as a float, or None when nothing matches."""
        m = pattern.search(text)
        return cls._to_float(m.group(1)) if m else None

    @staticmethod
    def _find_int(pattern: "re.Pattern[str]", text: str) -> int | None:
        """Return group 1 of the first match as an int, or None when nothing matches."""
        m = pattern.search(text)
        return int(m.group(1)) if m else None

    def extract(self, req: FeatureExtractRequest) -> FeatureExtractResponse:
        """Extract structured features, tokens and named entities from ``req.text``.

        The text is NFC-normalised first: the patterns and keywords in this
        class are written with precomposed characters, so decomposed (NFD)
        input would otherwise match nothing.

        Returns:
            A ``FeatureExtractResponse``. Features that are not found stay
            ``None``, except ``has_legal_paper`` which is ``False`` when no
            legal keyword is present.
        """
        import unicodedata

        text = unicodedata.normalize("NFC", req.text)
        text_lower = text.lower()
        features = ExtractedFeatures()

        # Simple numeric features: the first occurrence in the text wins.
        features.area = self._find_float(self._AREA_PATTERN, text)
        features.bedrooms = self._find_int(self._BEDROOM_PATTERN, text)
        features.bathrooms = self._find_int(self._BATHROOM_PATTERN, text)
        features.floors = self._find_int(self._FLOOR_PATTERN, text)
        features.frontage = self._find_float(self._FRONTAGE_PATTERN, text)
        features.road_width = self._find_float(self._ROAD_WIDTH_PATTERN, text)

        # Price: scale by the captured unit.
        m = self._PRICE_PATTERN.search(text)
        if m:
            amount = self._to_float(m.group(1))
            if m.group(2).lower() in self._BILLION_UNITS:
                features.price_mentioned = amount * 1_000_000_000
            else:
                features.price_mentioned = amount * 1_000_000

        # Legal
        features.has_legal_paper = any(kw in text_lower for kw in self._LEGAL_KEYWORDS)

        # Property type
        for vn_type, en_type in self._PROPERTY_TYPES.items():
            if vn_type in text_lower:
                features.property_type = en_type
                break

        # Tokenization and NER via underthesea (best effort).
        tokens: list[str] = []
        entities: list[dict] = []
        try:
            from underthesea import ner, word_tokenize

            tokens = word_tokenize(text)
            for chunk in ner(text):
                # Only entries with at least four elements are used: element 0
                # is reported as the text, element 3 as the label ("O" = none).
                if len(chunk) >= 4 and chunk[3] != "O":
                    entities.append({"text": chunk[0], "label": chunk[3]})
        except ImportError:
            logger.warning("underthesea not available — skipping NLP tokenization")
            tokens = text.split()
        except Exception:
            # A runtime NLP failure must not fail the whole request: the
            # regex-derived features above are still valid.
            logger.exception("underthesea failed — falling back to whitespace tokens")
            tokens = text.split()
            entities = []

        return FeatureExtractResponse(
            features=features,
            tokens=tokens,
            entities=entities,
        )
|
||||||
|
|
||||||
|
|
||||||
|
avm_service = AVMService()
|
||||||
|
feature_extract_service = FeatureExtractService()
|
||||||
96
libs/ai-services/app/services/moderation_service.py
Normal file
96
libs/ai-services/app/services/moderation_service.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
from app.models.moderation import ModerationFlag, ModerationRequest, ModerationResponse
|
||||||
|
|
||||||
|
# Blocklist categories with patterns and severity
|
||||||
|
_RULES: list[dict] = [
|
||||||
|
{
|
||||||
|
"category": "contact_info",
|
||||||
|
"severity": "medium",
|
||||||
|
"patterns": [
|
||||||
|
re.compile(r"0\d{9,10}"), # Vietnamese phone numbers
|
||||||
|
re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.]+\b"), # Email
|
||||||
|
re.compile(r"(?:zalo|viber|telegram|whatsapp)\s*[:\-]?\s*\d+", re.IGNORECASE),
|
||||||
|
],
|
||||||
|
"reason": "Contact information detected — may bypass platform messaging",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"category": "spam",
|
||||||
|
"severity": "low",
|
||||||
|
"patterns": [
|
||||||
|
re.compile(r"(.)\1{5,}"), # Repeated characters
|
||||||
|
re.compile(r"(!!!|\.\.\.){3,}"), # Excessive punctuation
|
||||||
|
re.compile(r"(?:click|nhấn|bấm)\s+(?:here|vào đây|link)", re.IGNORECASE),
|
||||||
|
],
|
||||||
|
"reason": "Spam-like content pattern",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"category": "profanity",
|
||||||
|
"severity": "high",
|
||||||
|
"patterns": [
|
||||||
|
re.compile(
|
||||||
|
r"\b(?:lừa đảo|scam|fake|giả mạo)\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
),
|
||||||
|
],
|
||||||
|
"reason": "Potentially harmful or fraudulent language",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"category": "prohibited_content",
|
||||||
|
"severity": "high",
|
||||||
|
"patterns": [
|
||||||
|
re.compile(
|
||||||
|
r"\b(?:đất rừng phòng hộ|đất quốc phòng|đất tranh chấp)\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
),
|
||||||
|
],
|
||||||
|
"reason": "Listing references prohibited property types",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ModerationService:
    """Rule-based content moderation driven by the module-level ``_RULES`` table."""

    # Weight of each severity level in the aggregate score; unknown levels use 0.5.
    _SEVERITY_WEIGHTS = {"low": 0.2, "medium": 0.5, "high": 0.9}
    _DEFAULT_WEIGHT = 0.5
    _REDACTION = "[REDACTED]"

    def check(self, req: ModerationRequest) -> ModerationResponse:
        """Run every rule pattern over ``req.text`` and summarise the hits.

        Returns:
            An unflagged response (score 0.0, text unchanged) when nothing
            matches. Otherwise a flagged response whose score blends the
            highest (70 %) and the mean (30 %) severity weight, capped at 1.0
            and rounded to three decimals, together with one flag per match
            and a copy of the text with the matched spans redacted.
        """
        text = req.text
        flags: list[ModerationFlag] = []
        spans: list[tuple[int, int]] = []

        for rule in _RULES:
            for pattern in rule["patterns"]:
                for match in pattern.finditer(text):
                    flags.append(
                        ModerationFlag(
                            category=rule["category"],
                            severity=rule["severity"],
                            matched_text=match.group(),
                            reason=rule["reason"],
                        )
                    )
                    spans.append(match.span())

        if not flags:
            return ModerationResponse(
                is_flagged=False,
                score=0.0,
                flags=[],
                cleaned_text=text,
            )

        # Compute aggregate score
        weights = [
            self._SEVERITY_WEIGHTS.get(f.severity, self._DEFAULT_WEIGHT) for f in flags
        ]
        max_score = max(weights)
        avg_score = sum(weights) / len(weights)
        score = round(min(1.0, max_score * 0.7 + avg_score * 0.3), 3)

        return ModerationResponse(
            is_flagged=True,
            score=score,
            flags=flags,
            cleaned_text=self._redact(text, spans),
        )

    @classmethod
    def _redact(cls, text: str, spans: list[tuple[int, int]]) -> str:
        """Replace the given ``(start, end)`` spans of ``text`` with the redaction marker.

        Works on match positions rather than on matched strings, so only text
        that actually matched is removed, and overlapping matches (e.g. a phone
        number that is also part of a "zalo: <number>" hit) collapse into one
        marker instead of leaving fragments behind.
        """
        pieces: list[str] = []
        cursor = 0  # end of the text already emitted or redacted
        for start, end in sorted(spans):
            if end <= cursor:
                continue  # entirely inside a region that is already redacted
            if start >= cursor:
                pieces.append(text[cursor:start])
                pieces.append(cls._REDACTION)
            # else: overlaps the previous region — extend it without a new marker
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
moderation_service = ModerationService()
|
||||||
30
libs/ai-services/pyproject.toml
Normal file
30
libs/ai-services/pyproject.toml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
[project]
name = "goodgo-ai-services"
version = "0.1.0"
description = "AI/ML services for Goodgo Platform — AVM, feature extraction, moderation"
requires-python = ">=3.12"
dependencies = [
    "fastapi>=0.115.0",
    "uvicorn[standard]>=0.32.0",
    "xgboost>=2.1.0",
    "numpy>=1.26.0",
    "underthesea>=6.8.0",
    "pydantic>=2.9.0",
    "pydantic-settings>=2.5.0",
    "httpx>=0.27.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.3.0",
    "pytest-asyncio>=0.24.0",
    "httpx>=0.27.0",
]

[build-system]
requires = ["setuptools>=75.0"]
# setuptools' PEP 517 backend. The previous value pointed at a module that does
# not exist, so `pip install .` could never succeed.
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
# Package only the application code; keeps `tests` out of the distribution.
include = ["app*"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
|
||||||
0
libs/ai-services/tests/__init__.py
Normal file
0
libs/ai-services/tests/__init__.py
Normal file
59
libs/ai-services/tests/test_avm.py
Normal file
59
libs/ai-services/tests/test_avm.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from app.main import app
|
||||||
|
|
||||||
|
client = TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def test_predict_heuristic():
    """A well-formed request yields a positive estimate bracketed by its price range."""
    payload = {
        "area": 80.0,
        "district": "Cầu Giấy",
        "city": "Hà Nội",
        "property_type": "apartment",
        "bedrooms": 2,
        "bathrooms": 2,
        "floors": 1,
        "frontage": 0,
        "road_width": 0,
        "has_legal_paper": True,
    }
    resp = client.post("/avm/predict", json=payload)
    assert resp.status_code == 200

    data = resp.json()
    estimate = data["estimated_price_vnd"]
    assert estimate > 0
    assert 0 <= data["confidence"] <= 1
    assert data["price_per_m2"] > 0
    assert data["price_range_low"] < estimate
    assert data["price_range_high"] > estimate
|
||||||
|
|
||||||
|
|
||||||
|
def test_predict_validation_error():
    """A negative area and an empty district are rejected with HTTP 422."""
    invalid_payload = {"area": -10, "district": "", "city": "HN", "property_type": "house"}
    resp = client.post("/avm/predict", json=invalid_payload)
    assert resp.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_features():
    """A rich listing text yields area, room counts, type, legal status and price."""
    text = "Bán căn hộ chung cư 80m2 3 phòng ngủ 2 WC tầng 10 giá 3.5 tỷ sổ đỏ chính chủ"
    resp = client.post("/avm/extract-features", json={"text": text})
    assert resp.status_code == 200

    features = resp.json()["features"]
    assert features["area"] == 80.0
    assert features["bedrooms"] == 3
    assert features["bathrooms"] == 2
    assert features["property_type"] == "apartment"
    assert features["has_legal_paper"] is True
    assert features["price_mentioned"] == 3_500_000_000
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_features_minimal():
    """A text naming only the property type still resolves that type."""
    payload = {"text": "Bán nhà riêng"}
    resp = client.post("/avm/extract-features", json=payload)
    assert resp.status_code == 200

    features = resp.json()["features"]
    assert features["property_type"] == "house"
|
||||||
12
libs/ai-services/tests/test_health.py
Normal file
12
libs/ai-services/tests/test_health.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from app.main import app
|
||||||
|
|
||||||
|
client = TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def test_health():
    """GET /health answers 200 with status "ok"."""
    resp = client.get("/health")
    assert resp.status_code == 200

    body = resp.json()
    assert body["status"] == "ok"
|
||||||
50
libs/ai-services/tests/test_moderation.py
Normal file
50
libs/ai-services/tests/test_moderation.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from app.main import app
|
||||||
|
|
||||||
|
client = TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def test_clean_text():
    """Harmless listing text is not flagged and scores exactly 0.0."""
    payload = {"text": "Bán căn hộ đẹp tại quận 1", "context": "listing"}
    resp = client.post("/moderation/check", json=payload)
    assert resp.status_code == 200

    verdict = resp.json()
    assert verdict["is_flagged"] is False
    assert verdict["score"] == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_phone_number_flagged():
    """A phone number raises a contact_info flag and is redacted from the text."""
    payload = {"text": "Liên hệ 0912345678 để xem nhà", "context": "listing"}
    resp = client.post("/moderation/check", json=payload)
    assert resp.status_code == 200

    verdict = resp.json()
    assert verdict["is_flagged"] is True
    categories = [flag["category"] for flag in verdict["flags"]]
    assert "contact_info" in categories
    assert "[REDACTED]" in verdict["cleaned_text"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_scam_language_flagged():
    """Fraud-related wording raises a flag in the "profanity" category."""
    payload = {"text": "Cảnh báo lừa đảo từ chủ nhà", "context": "comment"}
    resp = client.post("/moderation/check", json=payload)
    assert resp.status_code == 200

    verdict = resp.json()
    assert verdict["is_flagged"] is True
    categories = [flag["category"] for flag in verdict["flags"]]
    assert "profanity" in categories
|
||||||
|
|
||||||
|
|
||||||
|
def test_prohibited_property():
    """A listing for protected forest land raises a prohibited_content flag."""
    payload = {"text": "Bán lô đất rừng phòng hộ 500m2", "context": "listing"}
    resp = client.post("/moderation/check", json=payload)
    assert resp.status_code == 200

    verdict = resp.json()
    assert verdict["is_flagged"] is True
    categories = [flag["category"] for flag in verdict["flags"]]
    assert "prohibited_content" in categories
|
||||||
Reference in New Issue
Block a user