diff --git a/libs/ai-services/app/main.py b/libs/ai-services/app/main.py
index cc27225..cf16d5f 100644
--- a/libs/ai-services/app/main.py
+++ b/libs/ai-services/app/main.py
@@ -6,7 +6,7 @@ from slowapi.util import get_remote_address
 
 from app.config import settings
 from app.middleware import verify_api_key
-from app.routers import avm, moderation
+from app.routers import avm, moderation, nlp
 
 limiter = Limiter(key_func=get_remote_address, default_limits=[settings.rate_limit])
 
@@ -33,6 +33,7 @@ app.add_middleware(
 
 app.include_router(avm.router)
 app.include_router(moderation.router)
+app.include_router(nlp.router)
 
 
 @app.get("/health")
diff --git a/libs/ai-services/app/models/nlp.py b/libs/ai-services/app/models/nlp.py
new file mode 100644
--- /dev/null
+++ b/libs/ai-services/app/models/nlp.py
@@ -0,0 +1,67 @@
+from typing import Annotated
+
+from pydantic import BaseModel, Field
+
+
+class NLPAnalyzeRequest(BaseModel):
+    """Request body for analyzing one property description."""
+
+    text: str = Field(..., min_length=1, description="Vietnamese property description text")
+    include_moderation: bool = Field(
+        True, description="Whether to include moderation quality score"
+    )
+
+
+class PropertyTag(BaseModel):
+    """A dictionary tag detected in the description."""
+
+    category: str = Field(..., description="Tag category: amenity, location, condition, legal")
+    tag: str = Field(..., description="Normalized tag name")
+    matched_text: str = Field(..., description="Original text that matched")
+    confidence: float = Field(..., ge=0, le=1, description="Match confidence")
+
+
+class QualityScore(BaseModel):
+    """Content-quality scores, each in the range 0-1."""
+
+    overall: float = Field(..., ge=0, le=1, description="Overall content quality 0-1")
+    completeness: float = Field(
+        ..., ge=0, le=1, description="How complete the listing info is"
+    )
+    readability: float = Field(..., ge=0, le=1, description="Text readability score")
+    information_density: float = Field(
+        ..., ge=0, le=1, description="Ratio of useful info to total text"
+    )
+    moderation_score: float | None = Field(
+        None, ge=0, le=1, description="Moderation risk score (0=safe, 1=risky)"
+    )
+
+
+class NLPAnalyzeResponse(BaseModel):
+    """Tags, quality scores and segmentation for one description."""
+
+    tags: list[PropertyTag] = Field(default_factory=list)
+    quality: QualityScore
+    tokens: list[str] = Field(default_factory=list, description="Word-segmented tokens")
+    sentences: list[str] = Field(default_factory=list, description="Sentence-split results")
+    keyword_phrases: list[str] = Field(
+        default_factory=list, description="Key noun phrases extracted"
+    )
+
+
+class BatchAnalyzeRequest(BaseModel):
+    """Request body for analyzing up to 50 descriptions in one call."""
+
+    # Items carry their own min_length so an empty string is rejected with a
+    # 422 at the request boundary instead of failing when it is re-validated
+    # as an NLPAnalyzeRequest inside the handler.
+    texts: list[Annotated[str, Field(min_length=1)]] = Field(
+        ..., min_length=1, max_length=50, description="Batch of texts"
+    )
+    include_moderation: bool = Field(True)
+
+
+class BatchAnalyzeResponse(BaseModel):
+    """Per-text results, in the same order as the request's ``texts``."""
+
+    results: list[NLPAnalyzeResponse]
diff --git a/libs/ai-services/app/routers/nlp.py b/libs/ai-services/app/routers/nlp.py
new file mode 100644
index 0000000..9fe78d9
--- /dev/null
+++ b/libs/ai-services/app/routers/nlp.py
@@ -0,0 +1,27 @@
+from fastapi import APIRouter
+
+from app.models.nlp import (
+    BatchAnalyzeRequest,
+    BatchAnalyzeResponse,
+    NLPAnalyzeRequest,
+    NLPAnalyzeResponse,
+)
+from app.services.nlp_service import nlp_service
+
+router = APIRouter(prefix="/nlp", tags=["NLP"])
+
+
+@router.post("/analyze", response_model=NLPAnalyzeResponse)
+def analyze(req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
+    """Analyze Vietnamese property description: auto-tag, quality score, tokenize."""
+    return nlp_service.analyze(req)
+
+
+@router.post("/batch-analyze", response_model=BatchAnalyzeResponse)
+def batch_analyze(req: BatchAnalyzeRequest) -> BatchAnalyzeResponse:
+    """Batch analyze multiple property descriptions."""
+    results = [
+        nlp_service.analyze(NLPAnalyzeRequest(text=t, include_moderation=req.include_moderation))
+        for t in req.texts
+    ]
+    return BatchAnalyzeResponse(results=results)
diff --git a/libs/ai-services/app/services/nlp_service.py b/libs/ai-services/app/services/nlp_service.py
new file mode 100644
--- /dev/null
+++ b/libs/ai-services/app/services/nlp_service.py
@@ -0,0 +1,285 @@
+import logging
+import re
+import unicodedata
+
+from app.models.nlp import (
+    NLPAnalyzeRequest,
+    NLPAnalyzeResponse,
+    PropertyTag,
+    QualityScore,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Tag dictionaries — Vietnamese real-estate domain
+# ---------------------------------------------------------------------------
+
+AMENITY_TAGS: dict[str, list[str]] = {
+    "hồ bơi": ["hồ bơi", "bể bơi", "pool"],
+    "phòng gym": ["phòng gym", "gym", "phòng tập"],
+    "sân vườn": ["sân vườn", "vườn", "garden"],
+    "gara ô tô": ["gara", "garage", "nhà xe", "chỗ đậu xe"],
+    "thang máy": ["thang máy", "elevator"],
+    "ban công": ["ban công", "balcony", "lô gia", "logia"],
+    "sân thượng": ["sân thượng", "rooftop"],
+    "bảo vệ 24/7": ["bảo vệ 24", "an ninh 24", "security 24"],
+    "khu vui chơi": ["khu vui chơi", "playground", "sân chơi trẻ em"],
+    "hầm để xe": ["hầm để xe", "hầm xe", "tầng hầm"],
+    "điều hòa": ["điều hòa", "máy lạnh"],
+    "nội thất cao cấp": ["nội thất cao cấp", "full nội thất", "nội thất đầy đủ"],
+    "camera an ninh": ["camera", "camera an ninh"],
+    "sân tennis": ["sân tennis", "tennis"],
+    "công viên nội khu": ["công viên nội khu", "công viên", "park"],
+}
+
+LOCATION_TAGS: dict[str, list[str]] = {
+    "gần trường học": ["gần trường", "cạnh trường", "kế trường"],
+    "gần bệnh viện": ["gần bệnh viện", "cạnh bệnh viện", "kế bệnh viện"],
+    "gần chợ": ["gần chợ", "cạnh chợ", "kế chợ"],
+    "gần siêu thị": ["gần siêu thị", "cạnh siêu thị", "siêu thị"],
+    "mặt tiền đường": ["mặt tiền", "mặt đường", "mặt phố"],
+    "gần metro": ["gần metro", "gần tàu điện", "cạnh metro"],
+    "gần sân bay": ["gần sân bay", "cạnh sân bay"],
+    "trung tâm thành phố": ["trung tâm", "trung tâm thành phố", "trung tâm tp"],
+    "ven sông": ["ven sông", "view sông", "mặt sông"],
+    "gần biển": ["gần biển", "view biển", "mặt biển", "sát biển"],
+    "gần công viên": ["gần công viên", "cạnh công viên"],
+    "khu dân cư": ["khu dân cư", "kdc"],
+}
+
+CONDITION_TAGS: dict[str, list[str]] = {
+    "mới xây": ["mới xây", "xây mới", "mới hoàn thiện", "vừa xây xong"],
+    "đã qua sử dụng": ["đã qua sử dụng", "đã sử dụng"],
+    "cần sửa chữa": ["cần sửa", "cần cải tạo", "cần nâng cấp", "sửa chữa"],
+    "đang xây dựng": ["đang xây", "đang thi công", "sắp bàn giao"],
+    "hoàn thiện cơ bản": ["hoàn thiện cơ bản", "bàn giao thô"],
+    "đầy đủ nội thất": ["đầy đủ nội thất", "full nội thất", "nội thất đầy đủ"],
+}
+
+# Ownership and paperwork tags, reported under the "legal" category that
+# PropertyTag.category advertises.
+LEGAL_TAGS: dict[str, list[str]] = {
+    "sổ đỏ": ["sổ đỏ"],
+    "sổ hồng": ["sổ hồng"],
+    "chính chủ": ["chính chủ"],
+    "pháp lý rõ ràng": ["pháp lý rõ", "pháp lý đầy đủ", "pháp lý sạch"],
+}
+
+# Categories for completeness scoring — which info fields are expected
+_COMPLETENESS_FIELDS = [
+    r"\d+(?:[.,]\d+)?\s*(?:m2|m²|mét vuông)",  # area
+    r"\d+\s*(?:phòng ngủ|pn|PN)",  # bedrooms
+    r"\d+\s*(?:tầng|lầu)",  # floors
+    r"(?:sổ đỏ|sổ hồng|chính chủ|pháp lý)",  # legal
+    r"\d+(?:[.,]\d+)?\s*(?:tỷ|tỉ|triệu)",  # price
+    r"(?:quận|huyện|phường|xã|đường|tp\.|thành phố)",  # location
+    r"(?:căn hộ|chung cư|nhà phố|biệt thự|đất|shophouse|nhà riêng)",  # property type
+]
+
+# A "." between two digits is a decimal point or thousands separator
+# ("3.5 tỷ", "3.500.000"), not the end of a sentence.
+_SENTENCE_PERIOD = r"(?<!\d)\.|\.(?!\d)"
+_READABILITY_SPLIT = re.compile(rf"(?:{_SENTENCE_PERIOD}|[!?;\n])+")
+_FALLBACK_SENTENCE_SPLIT = re.compile(rf"(?:{_SENTENCE_PERIOD}|[!?\n])+")
+
+
+def _nfc(text: str) -> str:
+    """Return *text* in Unicode NFC form.
+
+    Vietnamese diacritics can be typed precomposed or as base letter plus
+    combining marks; composing both sides makes substring matching reliable.
+    """
+    return unicodedata.normalize("NFC", text)
+
+
+class NLPService:
+    """Vietnamese NLP pipeline for property description analysis."""
+
+    def _match_tags(
+        self,
+        text_lower: str,
+        tag_dict: dict[str, list[str]],
+        category: str,
+        original_text: str | None = None,
+    ) -> list[PropertyTag]:
+        """Return at most one tag per ``tag_dict`` entry found in the text.
+
+        Args:
+            text_lower: Lower-cased text that keywords are searched in.
+            tag_dict: Normalized tag name mapped to its keyword variants.
+            category: Category recorded on every tag produced.
+            original_text: The same text before lower-casing. When it is
+                index-aligned with ``text_lower``, ``matched_text`` is cut
+                from it so the author's original casing is reported.
+        """
+        # str.lower() changes the length of a few characters (e.g. "İ"), which
+        # would shift indices, so only slice the original when lengths agree.
+        source = text_lower
+        if original_text is not None and len(original_text) == len(text_lower):
+            source = original_text
+
+        tags: list[PropertyTag] = []
+        for tag_name, keywords in tag_dict.items():
+            # Longest variant first so matched_text is the most specific one.
+            for kw in sorted(map(_nfc, keywords), key=len, reverse=True):
+                idx = text_lower.find(kw)
+                if idx == -1:
+                    continue
+                tags.append(
+                    PropertyTag(
+                        category=category,
+                        tag=tag_name,
+                        matched_text=source[idx : idx + len(kw)],
+                        confidence=0.9 if len(kw) > 3 else 0.75,
+                    )
+                )
+                break  # one tag per name; remaining variants are redundant
+        return tags
+
+    def _compute_completeness(self, text: str) -> float:
+        """Return the fraction of expected listing fields mentioned in *text*."""
+        text = _nfc(text)
+        matched = sum(
+            1
+            for pattern in _COMPLETENESS_FIELDS
+            if re.search(_nfc(pattern), text, re.IGNORECASE)
+        )
+        return round(matched / len(_COMPLETENESS_FIELDS), 3)
+
+    def _compute_readability(self, text: str) -> float:
+        """Score readability from average sentence length and upper-case ratio."""
+        sentences = [s.strip() for s in _READABILITY_SPLIT.split(text) if s.strip()]
+        if not sentences:
+            return 0.0
+
+        avg_sentence_len = sum(len(s.split()) for s in sentences) / len(sentences)
+
+        # Penalize very short or very long sentences
+        if avg_sentence_len < 3:
+            score = 0.4
+        elif avg_sentence_len > 40:
+            score = 0.5
+        else:
+            # Best score at 14 words per sentence, falling off linearly
+            score = 1.0 - abs(avg_sentence_len - 14) / 30
+            score = max(0.3, min(1.0, score))
+
+        # Penalize shouting: more than 30% of characters in upper case
+        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
+        if caps_ratio > 0.3:
+            score *= 0.7
+
+        return round(score, 3)
+
+    def _compute_info_density(self, text: str, tag_count: int) -> float:
+        """Return tags per word, scaled by 5 and capped at 1.0."""
+        word_count = len(text.split())
+        if word_count == 0:
+            return 0.0
+        # More tags per word = higher density, capped at 1.0
+        density = min(1.0, (tag_count * 5) / word_count)
+        return round(density, 3)
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Word-segment *text* with underthesea, or split on whitespace without it."""
+        try:
+            from underthesea import word_tokenize
+
+            return word_tokenize(text)
+        except ImportError:
+            logger.warning("underthesea not available — falling back to whitespace split")
+            return text.split()
+
+    def _sentence_split(self, text: str) -> list[str]:
+        """Split *text* into sentences with underthesea, or a regex without it."""
+        try:
+            from underthesea import sent_tokenize
+
+            return sent_tokenize(text)
+        except ImportError:
+            return [s.strip() for s in _FALLBACK_SENTENCE_SPLIT.split(text) if s.strip()]
+
+    def _extract_noun_phrases(self, tokens: list[str], text: str) -> list[str]:
+        """Extract key noun phrases using POS tagging when available."""
+        try:
+            from underthesea import pos_tag
+
+            tagged = pos_tag(text)
+            phrases: list[str] = []
+            current_phrase: list[str] = []
+
+            for word, pos in tagged:
+                if pos in ("N", "Np", "Nc", "Nu", "A"):
+                    current_phrase.append(word)
+                else:
+                    if len(current_phrase) >= 2:
+                        phrases.append(" ".join(current_phrase))
+                    current_phrase = []
+
+            if len(current_phrase) >= 2:
+                phrases.append(" ".join(current_phrase))
+
+            return phrases[:20]  # Cap at 20 phrases
+        except ImportError:
+            return []
+
+    def analyze(self, req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
+        """Tag, segment and quality-score one property description."""
+        text = req.text
+        # Compose diacritics before matching; the original text is still what
+        # gets tokenized and scored for readability.
+        text_nfc = _nfc(text)
+        text_lower = text_nfc.lower()
+
+        # Auto-tag
+        amenity_tags = self._match_tags(text_lower, AMENITY_TAGS, "amenity", text_nfc)
+        location_tags = self._match_tags(text_lower, LOCATION_TAGS, "location", text_nfc)
+        condition_tags = self._match_tags(text_lower, CONDITION_TAGS, "condition", text_nfc)
+        legal_tags = self._match_tags(text_lower, LEGAL_TAGS, "legal", text_nfc)
+        all_tags = amenity_tags + location_tags + condition_tags + legal_tags
+
+        # Tokenization
+        tokens = self._tokenize(text)
+        sentences = self._sentence_split(text)
+        keyword_phrases = self._extract_noun_phrases(tokens, text)
+
+        # Quality scoring
+        completeness = self._compute_completeness(text)
+        readability = self._compute_readability(text)
+        info_density = self._compute_info_density(text, len(all_tags))
+
+        # Moderation integration
+        moderation_score: float | None = None
+        if req.include_moderation:
+            from app.services.moderation_service import moderation_service
+            from app.models.moderation import ModerationRequest
+
+            mod_result = moderation_service.check(ModerationRequest(text=text, context="listing"))
+            moderation_score = mod_result.score
+
+        # Overall quality: weighted combination
+        mod_penalty = (1 - moderation_score) if moderation_score is not None else 1.0
+        overall = round(
+            (completeness * 0.4 + readability * 0.3 + info_density * 0.3) * mod_penalty,
+            3,
+        )
+
+        quality = QualityScore(
+            overall=overall,
+            completeness=completeness,
+            readability=readability,
+            information_density=info_density,
+            moderation_score=moderation_score,
+        )
+
+        return NLPAnalyzeResponse(
+            tags=all_tags,
+            quality=quality,
+            tokens=tokens,
+            sentences=sentences,
+            keyword_phrases=keyword_phrases,
+        )
+
+
+nlp_service = NLPService()
diff --git a/libs/ai-services/tests/test_nlp.py b/libs/ai-services/tests/test_nlp.py
new file mode 100644
--- /dev/null
+++ b/libs/ai-services/tests/test_nlp.py
@@ -0,0 +1,162 @@
+from fastapi.testclient import TestClient
+
+from app.main import app
+from app.services.nlp_service import nlp_service
+
+client = TestClient(app)
+
+SAMPLE_LISTING = (
+    "Bán căn hộ chung cư cao cấp 85m² tại quận 7, 2 phòng ngủ, 2 WC, "
+    "3 tầng, nội thất đầy đủ. Có hồ bơi, phòng gym, bảo vệ 24/7. "
+    "Gần trường học và siêu thị. Sổ hồng chính chủ. Giá 3.5 tỷ."
+)
+
+
+def test_analyze_returns_tags():
+    resp = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
+    assert resp.status_code == 200
+    data = resp.json()
+
+    tags = data["tags"]
+    tag_names = [t["tag"] for t in tags]
+
+    # Amenities
+    assert "hồ bơi" in tag_names
+    assert "phòng gym" in tag_names
+    assert "bảo vệ 24/7" in tag_names
+
+    # Location
+    assert "gần trường học" in tag_names
+    assert "gần siêu thị" in tag_names
+
+    # Condition / legal
+    assert "sổ hồng" in tag_names
+    assert "chính chủ" in tag_names
+
+
+def test_analyze_quality_scores():
+    resp = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
+    assert resp.status_code == 200
+    quality = resp.json()["quality"]
+
+    assert 0 < quality["overall"] <= 1
+    assert 0 < quality["completeness"] <= 1
+    assert 0 < quality["readability"] <= 1
+    assert 0 < quality["information_density"] <= 1
+    assert quality["moderation_score"] is not None
+    assert quality["moderation_score"] == 0.0  # clean listing
+
+
+def test_analyze_completeness_low_for_sparse_text():
+    resp = client.post("/nlp/analyze", json={"text": "Bán nhà đẹp giá tốt"})
+    assert resp.status_code == 200
+    quality = resp.json()["quality"]
+    assert quality["completeness"] < 0.3
+
+
+def test_analyze_tokens_present():
+    resp = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
+    assert resp.status_code == 200
+    data = resp.json()
+    assert len(data["tokens"]) > 0
+    assert len(data["sentences"]) > 0
+
+
+def test_analyze_no_moderation():
+    resp = client.post(
+        "/nlp/analyze",
+        json={"text": SAMPLE_LISTING, "include_moderation": False},
+    )
+    assert resp.status_code == 200
+    quality = resp.json()["quality"]
+    assert quality["moderation_score"] is None
+
+
+def test_analyze_flagged_content_reduces_quality():
+    flagged_text = (
+        "Bán căn hộ 80m² 2 phòng ngủ quận 1. Liên hệ 0912345678. "
+        "Sổ hồng chính chủ. Giá 5 tỷ."
+    )
+    resp = client.post("/nlp/analyze", json={"text": flagged_text})
+    assert resp.status_code == 200
+    quality = resp.json()["quality"]
+    assert quality["moderation_score"] > 0  # phone number flagged
+
+    # The same text scored without moderation must come out strictly higher.
+    baseline = client.post(
+        "/nlp/analyze",
+        json={"text": flagged_text, "include_moderation": False},
+    )
+    assert baseline.status_code == 200
+    assert quality["overall"] < baseline.json()["quality"]["overall"]
+
+
+def test_batch_analyze():
+    resp = client.post(
+        "/nlp/batch-analyze",
+        json={
+            "texts": [
+                "Bán căn hộ 60m² có hồ bơi gần trường học. Sổ đỏ. 2 tỷ.",
+                "Bán đất nền 200m² mặt tiền đường lớn. Pháp lý rõ ràng.",
+            ],
+            "include_moderation": True,
+        },
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert len(data["results"]) == 2
+    assert any(t["tag"] == "hồ bơi" for t in data["results"][0]["tags"])
+    assert any(t["tag"] == "mặt tiền đường" for t in data["results"][1]["tags"])
+
+
+def test_batch_analyze_rejects_empty_item():
+    resp = client.post("/nlp/batch-analyze", json={"texts": ["Bán nhà", ""]})
+    assert resp.status_code == 422
+
+
+def test_analyze_location_tags():
+    text = "Căn hộ ven sông Sài Gòn, gần metro số 1, trung tâm thành phố."
+    resp = client.post("/nlp/analyze", json={"text": text})
+    assert resp.status_code == 200
+    tag_names = [t["tag"] for t in resp.json()["tags"]]
+    assert "ven sông" in tag_names
+    assert "gần metro" in tag_names
+    assert "trung tâm thành phố" in tag_names
+
+
+def test_analyze_condition_tags():
+    text = "Nhà mới xây, hoàn thiện cơ bản, 3 tầng, đang thi công sắp bàn giao."
+    resp = client.post("/nlp/analyze", json={"text": text})
+    assert resp.status_code == 200
+    tag_names = [t["tag"] for t in resp.json()["tags"]]
+    assert "mới xây" in tag_names
+    assert "hoàn thiện cơ bản" in tag_names
+
+
+def test_analyze_legal_tags_use_legal_category():
+    resp = client.post(
+        "/nlp/analyze",
+        json={"text": "Nhà mới xây, sổ hồng chính chủ.", "include_moderation": False},
+    )
+    assert resp.status_code == 200
+    categories = {t["tag"]: t["category"] for t in resp.json()["tags"]}
+    assert categories["sổ hồng"] == "legal"
+    assert categories["chính chủ"] == "legal"
+    assert categories["mới xây"] == "condition"
+
+
+def test_analyze_matched_text_keeps_original_case():
+    resp = client.post(
+        "/nlp/analyze",
+        json={"text": "Căn hộ có Hồ Bơi và Thang Máy.", "include_moderation": False},
+    )
+    assert resp.status_code == 200
+    matched = {t["tag"]: t["matched_text"] for t in resp.json()["tags"]}
+    assert matched["hồ bơi"] == "Hồ Bơi"
+    assert matched["thang máy"] == "Thang Máy"
+
+
+def test_readability_ignores_decimal_points():
+    # One 13-word sentence; "3.5" and "85.5" must not be treated as sentence ends.
+    text = "Giá 3.5 tỷ cho căn hộ 85.5 m² tại quận 7 rất đẹp."
+    assert nlp_service._compute_readability(text) == 0.967