feat(ai-services): add Vietnamese NLP pipeline for property description analysis

Implement auto-tagging (amenities, location features, condition/legal),
content quality scoring with moderation integration, and FastAPI endpoints
for single and batch text analysis. Uses underthesea for Vietnamese
tokenization/POS when available, with regex fallback.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-08 22:42:31 +07:00
parent 944d6262e7
commit ee3ae2e81d
5 changed files with 431 additions and 1 deletions

View File

@@ -6,7 +6,7 @@ from slowapi.util import get_remote_address
from app.config import settings
from app.middleware import verify_api_key
from app.routers import avm, moderation
from app.routers import avm, moderation, nlp
limiter = Limiter(key_func=get_remote_address, default_limits=[settings.rate_limit])
@@ -33,6 +33,7 @@ app.add_middleware(
app.include_router(avm.router)
app.include_router(moderation.router)
app.include_router(nlp.router)
@app.get("/health")

View File

@@ -0,0 +1,48 @@
from typing import Annotated

from pydantic import BaseModel, Field
class NLPAnalyzeRequest(BaseModel):
    # Request body for analysing a single property description.
    # (Plain comments are used instead of a docstring so the generated
    # JSON schema stays unchanged.)

    # Required, non-empty description text.
    text: str = Field(
        default=...,
        min_length=1,
        description="Vietnamese property description text",
    )
    # Moderation scoring is on by default; callers may opt out.
    include_moderation: bool = Field(
        default=True,
        description="Whether to include moderation quality score",
    )
class PropertyTag(BaseModel):
    # One tag detected in a property description. All four fields are required.

    category: str = Field(
        default=...,
        description="Tag category: amenity, location, condition, legal",
    )
    tag: str = Field(default=..., description="Normalized tag name")
    matched_text: str = Field(default=..., description="Original text that matched")
    # Confidence is validated to lie in the closed interval [0, 1].
    confidence: float = Field(default=..., ge=0, le=1, description="Match confidence")
class QualityScore(BaseModel):
    # Content-quality metrics for one description. Every score is validated to
    # lie in [0, 1]; only `moderation_score` is optional.

    overall: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Overall content quality 0-1",
    )
    completeness: float = Field(
        default=...,
        ge=0,
        le=1,
        description="How complete the listing info is",
    )
    readability: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Text readability score",
    )
    information_density: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Ratio of useful info to total text",
    )
    # Defaults to None, i.e. "no moderation score available".
    moderation_score: float | None = Field(
        default=None,
        ge=0,
        le=1,
        description="Moderation risk score (0=safe, 1=risky)",
    )
class NLPAnalyzeResponse(BaseModel):
    # Result of analysing one description. Only `quality` is required; every
    # list field defaults to a fresh empty list.

    tags: list[PropertyTag] = Field(default_factory=list)
    quality: QualityScore
    tokens: list[str] = Field(
        default_factory=list,
        description="Word-segmented tokens",
    )
    sentences: list[str] = Field(
        default_factory=list,
        description="Sentence-split results",
    )
    keyword_phrases: list[str] = Field(
        default_factory=list,
        description="Key noun phrases extracted",
    )
class BatchAnalyzeRequest(BaseModel):
    # Request body for batch analysis: between 1 and 50 texts per call.
    #
    # Each element carries the same `min_length=1` constraint as
    # `NLPAnalyzeRequest.text`. Without it an empty string passes request
    # validation and only fails later, when the per-item `NLPAnalyzeRequest`
    # is constructed inside the route handler — surfacing as a server error
    # instead of a request-validation error.
    texts: list[Annotated[str, Field(min_length=1)]] = Field(
        ..., min_length=1, max_length=50, description="Batch of texts"
    )
    # Applied to every text in the batch.
    include_moderation: bool = Field(True)
class BatchAnalyzeResponse(BaseModel):
    # Response body for batch analysis: one analysis result per entry.
    results: list[NLPAnalyzeResponse]

View File

@@ -0,0 +1,27 @@
from fastapi import APIRouter
from app.models.nlp import (
BatchAnalyzeRequest,
BatchAnalyzeResponse,
NLPAnalyzeRequest,
NLPAnalyzeResponse,
)
from app.services.nlp_service import nlp_service
# Router for the NLP endpoints; all routes below are mounted under "/nlp".
router = APIRouter(prefix="/nlp", tags=["NLP"])
@router.post("/analyze", response_model=NLPAnalyzeResponse)
def analyze(req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
    """Analyze Vietnamese property description: auto-tag, quality score, tokenize."""
    # The handler is a thin pass-through to the shared service instance.
    analysis = nlp_service.analyze(req)
    return analysis
@router.post("/batch-analyze", response_model=BatchAnalyzeResponse)
def batch_analyze(req: BatchAnalyzeRequest) -> BatchAnalyzeResponse:
    """Batch analyze multiple property descriptions."""
    # Each text is wrapped in its own single-item request (sharing the batch's
    # moderation flag) and analysed in order, so results line up with inputs.
    analysed: list[NLPAnalyzeResponse] = []
    for description in req.texts:
        single_request = NLPAnalyzeRequest(
            text=description,
            include_moderation=req.include_moderation,
        )
        analysed.append(nlp_service.analyze(single_request))
    return BatchAnalyzeResponse(results=analysed)

View File

@@ -0,0 +1,235 @@
import logging
import re
from app.models.nlp import (
NLPAnalyzeRequest,
NLPAnalyzeResponse,
PropertyTag,
QualityScore,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Tag dictionaries — Vietnamese real-estate domain
# ---------------------------------------------------------------------------
# Amenity tags: normalized tag name -> keyword variants (Vietnamese and English).
# Keywords are written in lower case because they are matched against the
# lower-cased description; the first variant found for a tag produces that tag.
# NOTE(review): "full nội thất" and "nội thất đầy đủ" are also listed under
# CONDITION_TAGS["đầy đủ nội thất"], so such text yields both tags — confirm
# the overlap is intended.
AMENITY_TAGS: dict[str, list[str]] = {
    "hồ bơi": ["hồ bơi", "bể bơi", "pool"],
    "phòng gym": ["phòng gym", "gym", "phòng tập"],
    "sân vườn": ["sân vườn", "vườn", "garden"],
    "gara ô tô": ["gara", "garage", "nhà xe", "chỗ đậu xe"],
    "thang máy": ["thang máy", "elevator"],
    "ban công": ["ban công", "balcony", "lô gia", "logia"],
    "sân thượng": ["sân thượng", "rooftop"],
    "bảo vệ 24/7": ["bảo vệ 24", "an ninh 24", "security 24"],
    "khu vui chơi": ["khu vui chơi", "playground", "sân chơi trẻ em"],
    "hầm để xe": ["hầm để xe", "hầm xe", "tầng hầm"],
    "điều hòa": ["điều hòa", "máy lạnh"],
    "nội thất cao cấp": ["nội thất cao cấp", "full nội thất", "nội thất đầy đủ"],
    "camera an ninh": ["camera", "camera an ninh"],
    "sân tennis": ["sân tennis", "tennis"],
    "công viên nội khu": ["công viên nội khu", "công viên", "park"],
}
# Location tags: normalized tag name -> lower-case keyword variants.
# Most variants are proximity phrases ("gần"/"cạnh"/"kế" + place), but a few
# tags also accept the bare place name (e.g. "siêu thị", "trung tâm"), so the
# tag can fire without an explicit proximity word.
LOCATION_TAGS: dict[str, list[str]] = {
    "gần trường học": ["gần trường", "cạnh trường", "kế trường"],
    "gần bệnh viện": ["gần bệnh viện", "cạnh bệnh viện", "kế bệnh viện"],
    "gần chợ": ["gần chợ", "cạnh chợ", "kế chợ"],
    "gần siêu thị": ["gần siêu thị", "cạnh siêu thị", "siêu thị"],
    "mặt tiền đường": ["mặt tiền", "mặt đường", "mặt phố"],
    "gần metro": ["gần metro", "gần tàu điện", "cạnh metro"],
    "gần sân bay": ["gần sân bay", "cạnh sân bay"],
    "trung tâm thành phố": ["trung tâm", "trung tâm thành phố", "trung tâm tp"],
    "ven sông": ["ven sông", "view sông", "mặt sông"],
    "gần biển": ["gần biển", "view biển", "mặt biển", "sát biển"],
    "gần công viên": ["gần công viên", "cạnh công viên"],
    "khu dân cư": ["khu dân cư", "kdc"],
}
# Condition and legal-status tags: normalized tag name -> lower-case keyword
# variants. Ownership/legal keywords ("sổ đỏ", "sổ hồng", "chính chủ",
# "pháp lý ...") live in this dictionary too.
# NOTE(review): the service emits every entry here with category "condition";
# the "legal" category named in PropertyTag's field description is never
# produced — confirm whether legal tags should be split out.
CONDITION_TAGS: dict[str, list[str]] = {
    "mới xây": ["mới xây", "xây mới", "mới hoàn thiện", "vừa xây xong"],
    "đã qua sử dụng": ["đã qua sử dụng", "đã sử dụng"],
    "cần sửa chữa": ["cần sửa", "cần cải tạo", "cần nâng cấp", "sửa chữa"],
    "đang xây dựng": ["đang xây", "đang thi công", "sắp bàn giao"],
    "sổ đỏ": ["sổ đỏ"],
    "sổ hồng": ["sổ hồng"],
    "chính chủ": ["chính chủ"],
    "pháp lý rõ ràng": ["pháp lý rõ", "pháp lý đầy đủ", "pháp lý sạch"],
    "hoàn thiện cơ bản": ["hoàn thiện cơ bản", "bàn giao thô"],
    "đầy đủ nội thất": ["đầy đủ nội thất", "full nội thất", "nội thất đầy đủ"],
}
# Categories for completeness scoring — which info fields are expected.
# One regex per expected field; the completeness score is the fraction of
# these patterns found in the text. The patterns are searched with
# re.IGNORECASE, so the upper-case "PN" alternative is covered by "pn" already.
# The patterns carry no word-boundary anchors, so they also match inside
# longer words.
_COMPLETENESS_FIELDS = [
    r"\d+(?:[.,]\d+)?\s*(?:m2|m²|mét vuông)",  # area
    r"\d+\s*(?:phòng ngủ|pn|PN)",  # bedrooms
    r"\d+\s*(?:tầng|lầu)",  # floors
    r"(?:sổ đỏ|sổ hồng|chính chủ|pháp lý)",  # legal
    r"\d+(?:[.,]\d+)?\s*(?:tỷ|tỉ|triệu)",  # price
    r"(?:quận|huyện|phường|xã|đường|tp\.|thành phố)",  # location
    r"(?:căn hộ|chung cư|nhà phố|biệt thự|đất|shophouse|nhà riêng)",  # property type
]
class NLPService:
    """Vietnamese NLP pipeline for property description analysis.

    Combines keyword-based auto-tagging, heuristic quality scoring and
    tokenization. ``underthesea`` is used for word segmentation, sentence
    splitting and POS tagging when it can be imported; otherwise simple
    whitespace/regex fallbacks are used.
    """

    # Sentence delimiters for the regex fallback. A "." is a delimiter unless
    # it sits between two digits ("3.5 tỷ", "0912.345.678"), so decimal
    # numbers are not cut in half.
    _SENTENCE_DELIMS = re.compile(r"(?:(?<!\d)\.|\.(?!\d)|[!?\n])+")
    # Same rule, but ";" also separates clauses for readability scoring.
    _READABILITY_DELIMS = re.compile(r"(?:(?<!\d)\.|\.(?!\d)|[!?;\n])+")

    # Flipped to True after the missing-underthesea warning has been logged,
    # so the warning is emitted once instead of once per analysed text.
    _fallback_warning_logged = False

    @staticmethod
    def _keyword_pattern(keyword: str) -> str:
        """Return a regex that matches *keyword* as a whole word/phrase.

        A leading boundary is added when the keyword starts with a letter or
        digit, so "park" no longer matches inside "carpark". A trailing
        boundary is added only when the keyword ends with a letter, so
        "park" no longer matches "parking" while numeric prefixes such as
        "bảo vệ 24" still match "bảo vệ 24h" and "bảo vệ 24/7".
        """
        leading = r"(?<!\w)" if keyword[0].isalnum() else ""
        trailing = r"(?!\w)" if keyword[-1].isalpha() else ""
        return leading + re.escape(keyword) + trailing

    def _match_tags(
        self,
        text_lower: str,
        tag_dict: dict[str, list[str]],
        category: str,
        original_text: str | None = None,
    ) -> list[PropertyTag]:
        """Find at most one tag per dictionary entry in the lower-cased text.

        Args:
            text_lower: Lower-cased description to search.
            tag_dict: Mapping of normalized tag name to lower-case keywords.
            category: Category label stored on every produced tag.
            original_text: Optional original-case text. When given and
                index-aligned with ``text_lower`` (same length),
                ``matched_text`` is sliced from it; otherwise the
                lower-cased match is reported.

        Returns:
            Tags in dictionary order; for each tag the first keyword (in list
            order) that occurs in the text decides the match.
        """
        # str.lower() never shrinks a character, so equal length means the
        # two strings are aligned index by index.
        if original_text is not None and len(original_text) == len(text_lower):
            source = original_text
        else:
            source = text_lower

        tags: list[PropertyTag] = []
        for tag_name, keywords in tag_dict.items():
            for kw in keywords:
                if not kw:
                    # An empty keyword would match everywhere; ignore it.
                    continue
                found = re.search(self._keyword_pattern(kw), text_lower)
                if found is None:
                    continue
                tags.append(
                    PropertyTag(
                        category=category,
                        tag=tag_name,
                        matched_text=source[found.start() : found.end()],
                        # Very short keywords are more ambiguous.
                        confidence=0.9 if len(kw) > 3 else 0.75,
                    )
                )
                break  # one tag per tag_name
        return tags

    def _compute_completeness(self, text: str) -> float:
        """Return the fraction of expected listing fields found in *text*.

        Each pattern in ``_COMPLETENESS_FIELDS`` is searched
        case-insensitively; the result is rounded to three decimals.
        """
        matched = sum(
            1 for pattern in _COMPLETENESS_FIELDS if re.search(pattern, text, re.IGNORECASE)
        )
        return round(matched / len(_COMPLETENESS_FIELDS), 3)

    def _compute_readability(self, text: str) -> float:
        """Score readability in [0, 1] from the average sentence length.

        Returns 0.0 when the text contains no sentences. Averages below 3 or
        above 40 words get fixed low scores; in between, the score falls off
        linearly with the distance from 14 words and is clamped to
        [0.3, 1.0]. Text with more than 30% upper-case characters is
        penalised by a factor of 0.7.
        """
        sentences = [s.strip() for s in self._READABILITY_DELIMS.split(text) if s.strip()]
        if not sentences:
            return 0.0
        avg_sentence_len = sum(len(s.split()) for s in sentences) / len(sentences)

        # Penalize very short or very long sentences
        if avg_sentence_len < 3:
            score = 0.4
        elif avg_sentence_len > 40:
            score = 0.5
        else:
            # Best score at 14 words per sentence, decreasing linearly
            score = 1.0 - abs(avg_sentence_len - 14) / 30
            score = max(0.3, min(1.0, score))

        # Penalize excessive caps ("shouting" listings)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        if caps_ratio > 0.3:
            score *= 0.7
        return round(score, 3)

    def _compute_info_density(self, text: str, tag_count: int) -> float:
        """Return tags-per-word density in [0, 1] (0.0 for empty text).

        One tag per five whitespace-separated words already yields 1.0.
        """
        word_count = len(text.split())
        if word_count == 0:
            return 0.0
        # More tags per word = higher density, capped at 1.0
        density = min(1.0, (tag_count * 5) / word_count)
        return round(density, 3)

    def _tokenize(self, text: str) -> list[str]:
        """Word-segment *text* with underthesea, else split on whitespace."""
        try:
            from underthesea import word_tokenize

            return word_tokenize(text)
        except ImportError:
            cls = type(self)
            if not cls._fallback_warning_logged:
                # Warn once; repeating this for every text floods the log.
                cls._fallback_warning_logged = True
                logger.warning("underthesea not available — falling back to whitespace split")
            return text.split()

    def _sentence_split(self, text: str) -> list[str]:
        """Split *text* into sentences with underthesea, else by regex.

        The regex fallback drops the delimiters and keeps decimal numbers
        such as "3.5" intact.
        """
        try:
            from underthesea import sent_tokenize

            return sent_tokenize(text)
        except ImportError:
            return [s.strip() for s in self._SENTENCE_DELIMS.split(text) if s.strip()]

    def _extract_noun_phrases(self, tokens: list[str], text: str) -> list[str]:
        """Extract key noun phrases using POS tagging when available.

        Consecutive words tagged N/Np/Nc/Nu/A are joined into a phrase; only
        runs of at least two words are kept, capped at 20 phrases. Returns an
        empty list when underthesea cannot be imported. ``tokens`` is not
        used (POS tagging runs on ``text``); it is kept so existing callers
        keep working.
        """
        try:
            from underthesea import pos_tag

            tagged = pos_tag(text)
            phrases: list[str] = []
            current_phrase: list[str] = []
            for word, pos in tagged:
                if pos in ("N", "Np", "Nc", "Nu", "A"):
                    current_phrase.append(word)
                else:
                    if len(current_phrase) >= 2:
                        phrases.append(" ".join(current_phrase))
                    current_phrase = []
            # Flush a phrase that runs to the end of the text.
            if len(current_phrase) >= 2:
                phrases.append(" ".join(current_phrase))
            return phrases[:20]  # Cap at 20 phrases
        except ImportError:
            return []

    def analyze(self, req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
        """Run the full pipeline on one description.

        Steps: Unicode-normalize, auto-tag, tokenize, score quality and —
        when ``req.include_moderation`` is set — fold the moderation score
        into the overall quality as a multiplicative penalty.
        """
        import unicodedata

        # Vietnamese text may arrive decomposed (NFD: base letter + combining
        # marks). The keyword dictionaries and regexes are written in
        # precomposed form, so compose first or nothing would match.
        text = unicodedata.normalize("NFC", req.text)
        text_lower = text.lower()

        # Auto-tag
        amenity_tags = self._match_tags(
            text_lower, AMENITY_TAGS, "amenity", original_text=text
        )
        location_tags = self._match_tags(
            text_lower, LOCATION_TAGS, "location", original_text=text
        )
        condition_tags = self._match_tags(
            text_lower, CONDITION_TAGS, "condition", original_text=text
        )
        all_tags = amenity_tags + location_tags + condition_tags

        # Tokenization
        tokens = self._tokenize(text)
        sentences = self._sentence_split(text)
        keyword_phrases = self._extract_noun_phrases(tokens, text)

        # Quality scoring
        completeness = self._compute_completeness(text)
        readability = self._compute_readability(text)
        info_density = self._compute_info_density(text, len(all_tags))

        # Moderation integration (imported lazily, only when requested)
        moderation_score: float | None = None
        if req.include_moderation:
            from app.services.moderation_service import moderation_service
            from app.models.moderation import ModerationRequest

            mod_result = moderation_service.check(ModerationRequest(text=text, context="listing"))
            moderation_score = mod_result.score

        # Overall quality: weighted combination, scaled down by moderation risk
        mod_penalty = (1 - moderation_score) if moderation_score is not None else 1.0
        overall = round(
            (completeness * 0.4 + readability * 0.3 + info_density * 0.3) * mod_penalty,
            3,
        )

        quality = QualityScore(
            overall=overall,
            completeness=completeness,
            readability=readability,
            information_density=info_density,
            moderation_score=moderation_score,
        )
        return NLPAnalyzeResponse(
            tags=all_tags,
            quality=quality,
            tokens=tokens,
            sentences=sentences,
            keyword_phrases=keyword_phrases,
        )
# Module-level service instance; the NLP router imports and calls this object.
nlp_service = NLPService()

View File

@@ -0,0 +1,119 @@
from fastapi.testclient import TestClient
from app.main import app
# Test client bound to the FastAPI app; used by every test in this module.
client = TestClient(app)
# Reference listing used by several tests. It mentions area, location,
# bedrooms, floors, amenities (pool, gym, 24/7 security), nearby school and
# supermarket, legal status and price.
SAMPLE_LISTING = (
    "Bán căn hộ chung cư cao cấp 85m² tại quận 7, 2 phòng ngủ, 2 WC, "
    "3 tầng, nội thất đầy đủ. Có hồ bơi, phòng gym, bảo vệ 24/7. "
    "Gần trường học và siêu thị. Sổ hồng chính chủ. Giá 3.5 tỷ."
)
def test_analyze_returns_tags():
    """The sample listing yields the expected amenity, location and legal tags."""
    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
    assert response.status_code == 200

    found = [entry["tag"] for entry in response.json()["tags"]]
    expected = (
        # Amenities
        "hồ bơi",
        "phòng gym",
        "bảo vệ 24/7",
        # Location
        "gần trường học",
        "gần siêu thị",
        # Condition / legal
        "sổ hồng",
        "chính chủ",
    )
    for tag_name in expected:
        assert tag_name in found
def test_analyze_quality_scores():
    """All quality metrics are in (0, 1] and the clean listing scores 0 risk."""
    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
    assert response.status_code == 200

    scores = response.json()["quality"]
    for metric in ("overall", "completeness", "readability", "information_density"):
        assert 0 < scores[metric] <= 1

    risk = scores["moderation_score"]
    assert risk is not None
    assert risk == 0.0  # clean listing
def test_analyze_completeness_low_for_sparse_text():
    """A listing with almost no concrete details gets a low completeness score."""
    sparse_payload = {"text": "Bán nhà đẹp giá tốt"}
    response = client.post("/nlp/analyze", json=sparse_payload)
    assert response.status_code == 200

    completeness = response.json()["quality"]["completeness"]
    assert completeness < 0.3
def test_analyze_tokens_present():
    """Tokens and sentences are returned for a non-empty listing."""
    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
    assert response.status_code == 200

    body = response.json()
    for key in ("tokens", "sentences"):
        assert len(body[key]) > 0
def test_analyze_no_moderation():
    """With moderation disabled, the moderation score is left as null."""
    payload = {"text": SAMPLE_LISTING, "include_moderation": False}
    response = client.post("/nlp/analyze", json=payload)
    assert response.status_code == 200

    scores = response.json()["quality"]
    assert scores["moderation_score"] is None
def test_analyze_flagged_content_reduces_quality():
    """Flagged content gets a moderation score and a lower overall quality.

    The same text is analysed with and without moderation; the moderated
    overall score must be strictly lower, which is what the test name claims.
    """
    flagged_text = (
        "Bán căn hộ 80m² 2 phòng ngủ quận 1. Liên hệ 0912345678. "
        "Sổ hồng chính chủ. Giá 5 tỷ."
    )
    resp = client.post("/nlp/analyze", json={"text": flagged_text})
    assert resp.status_code == 200
    quality = resp.json()["quality"]
    assert quality["moderation_score"] > 0  # phone number flagged

    # Baseline: identical text, moderation penalty switched off.
    baseline_resp = client.post(
        "/nlp/analyze",
        json={"text": flagged_text, "include_moderation": False},
    )
    assert baseline_resp.status_code == 200
    baseline_quality = baseline_resp.json()["quality"]
    assert quality["overall"] < baseline_quality["overall"]
def test_batch_analyze():
    """Batch analysis returns one result per text, in input order."""
    listings = [
        "Bán căn hộ 60m² có hồ bơi gần trường học. Sổ đỏ. 2 tỷ.",
        "Bán đất nền 200m² mặt tiền đường lớn. Pháp lý rõ ràng.",
    ]
    response = client.post(
        "/nlp/batch-analyze",
        json={"texts": listings, "include_moderation": True},
    )
    assert response.status_code == 200

    results = response.json()["results"]
    assert len(results) == 2

    first_tags = [entry["tag"] for entry in results[0]["tags"]]
    second_tags = [entry["tag"] for entry in results[1]["tags"]]
    assert "hồ bơi" in first_tags
    assert "mặt tiền đường" in second_tags
def test_analyze_location_tags():
    """River-side, metro and city-centre phrases map to their location tags."""
    description = "Căn hộ ven sông Sài Gòn, gần metro số 1, trung tâm thành phố."
    response = client.post("/nlp/analyze", json={"text": description})
    assert response.status_code == 200

    found = [entry["tag"] for entry in response.json()["tags"]]
    for tag_name in ("ven sông", "gần metro", "trung tâm thành phố"):
        assert tag_name in found
def test_analyze_condition_tags():
    """Condition phrases in the text map to their normalized condition tags.

    The text also says "đang thi công sắp bàn giao", so the
    "đang xây dựng" tag is asserted as well.
    """
    text = "Nhà mới xây, hoàn thiện cơ bản, 3 tầng, đang thi công sắp bàn giao."
    resp = client.post("/nlp/analyze", json={"text": text})
    assert resp.status_code == 200
    tag_names = [t["tag"] for t in resp.json()["tags"]]
    assert "mới xây" in tag_names
    assert "hoàn thiện cơ bản" in tag_names
    assert "đang xây dựng" in tag_names