Files
goodgo-platform/libs/ai-services/tests/test_nlp.py
Ho Ngoc Hai ee3ae2e81d feat(ai-services): add Vietnamese NLP pipeline for property description analysis
Implement auto-tagging (amenities, location features, condition/legal),
content quality scoring with moderation integration, and FastAPI endpoints
for single and batch text analysis. Uses underthesea for Vietnamese
tokenization/POS when available, with regex fallback.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-08 22:42:31 +07:00

120 lines
3.9 KiB
Python

from fastapi.testclient import TestClient
from app.main import app
# Module-level test client wrapping the ``app`` object imported from
# ``app.main``; the tests in this module send their requests through it.
client = TestClient(app)
# Vietnamese apartment listing reused by several tests as the "rich, clean"
# input. Written one sentence per literal; adjacent literals are concatenated
# into a single string.
SAMPLE_LISTING = (
    "Bán căn hộ chung cư cao cấp 85m² tại quận 7, 2 phòng ngủ, 2 WC, 3 tầng, nội thất đầy đủ. "
    "Có hồ bơi, phòng gym, bảo vệ 24/7. "
    "Gần trường học và siêu thị. "
    "Sổ hồng chính chủ. "
    "Giá 3.5 tỷ."
)
def test_analyze_returns_tags():
    """Check that ``/nlp/analyze`` tags the sample listing's key features.

    Posts ``SAMPLE_LISTING`` and expects the amenity, location, and
    condition/legal tags below to appear in the response's ``tags`` list.
    """
    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
    assert response.status_code == 200

    found = [entry["tag"] for entry in response.json()["tags"]]
    expected_tags = (
        # Amenities
        "hồ bơi",
        "phòng gym",
        "bảo vệ 24/7",
        # Location
        "gần trường học",
        "gần siêu thị",
        # Condition / legal
        "sổ hồng",
        "chính chủ",
    )
    # Checked one at a time, in order, so the first missing tag fails the test.
    for expected in expected_tags:
        assert expected in found
def test_analyze_quality_scores():
    """Check the quality metrics returned for the sample listing.

    Each of the four quality metrics must lie in the half-open range (0, 1],
    and the moderation score must be present and exactly zero.
    """
    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
    assert response.status_code == 200

    scores = response.json()["quality"]
    bounded_metrics = (
        "overall",
        "completeness",
        "readability",
        "information_density",
    )
    for metric in bounded_metrics:
        assert 0 < scores[metric] <= 1

    assert scores["moderation_score"] is not None
    assert scores["moderation_score"] == 0.0  # clean listing
def test_analyze_completeness_low_for_sparse_text():
    """A very short listing must score below 0.3 on completeness."""
    sparse_text = "Bán nhà đẹp giá tốt"
    response = client.post("/nlp/analyze", json={"text": sparse_text})
    assert response.status_code == 200

    scores = response.json()["quality"]
    assert scores["completeness"] < 0.3
def test_analyze_tokens_present():
    """The analysis response must carry non-empty ``tokens`` and ``sentences``."""
    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
    assert response.status_code == 200

    payload = response.json()
    for field in ("tokens", "sentences"):
        assert len(payload[field]) > 0
def test_analyze_no_moderation():
    """With ``include_moderation`` set to False, ``moderation_score`` is None."""
    request_body = {"text": SAMPLE_LISTING, "include_moderation": False}
    response = client.post("/nlp/analyze", json=request_body)
    assert response.status_code == 200

    scores = response.json()["quality"]
    assert scores["moderation_score"] is None
def test_analyze_flagged_content_reduces_quality():
    """A listing containing a phone number gets a positive moderation score.

    NOTE(review): despite the test name, only ``moderation_score`` is
    asserted; no quality metric is compared against an unflagged baseline.
    """
    listing_with_phone = (
        "Bán căn hộ 80m² 2 phòng ngủ quận 1. Liên hệ 0912345678. "
        "Sổ hồng chính chủ. Giá 5 tỷ."
    )
    response = client.post("/nlp/analyze", json={"text": listing_with_phone})
    assert response.status_code == 200

    scores = response.json()["quality"]
    assert scores["moderation_score"] > 0  # phone number flagged
def test_batch_analyze():
    """Check that ``/nlp/batch-analyze`` returns one result per input text.

    Two listings are submitted with moderation enabled; the response must
    contain two results, and each result must include the tag expected for
    the text at the same position.
    """
    listings = [
        "Bán căn hộ 60m² có hồ bơi gần trường học. Sổ đỏ. 2 tỷ.",
        "Bán đất nền 200m² mặt tiền đường lớn. Pháp lý rõ ràng.",
    ]
    request_body = {"texts": listings, "include_moderation": True}
    response = client.post("/nlp/batch-analyze", json=request_body)
    assert response.status_code == 200

    payload = response.json()
    assert len(payload["results"]) == 2

    # Expected tag for results[0] and results[1], checked in that order.
    expected_per_result = ("hồ bơi", "mặt tiền đường")
    for position, expected_tag in enumerate(expected_per_result):
        assert any(
            entry["tag"] == expected_tag
            for entry in payload["results"][position]["tags"]
        )
def test_analyze_location_tags():
    """Riverside, metro, and city-centre phrases must each produce a tag."""
    listing = "Căn hộ ven sông Sài Gòn, gần metro số 1, trung tâm thành phố."
    response = client.post("/nlp/analyze", json={"text": listing})
    assert response.status_code == 200

    found = [entry["tag"] for entry in response.json()["tags"]]
    for expected in ("ven sông", "gần metro", "trung tâm thành phố"):
        assert expected in found
def test_analyze_condition_tags():
    """The "mới xây" and "hoàn thiện cơ bản" condition tags must be detected."""
    listing = "Nhà mới xây, hoàn thiện cơ bản, 3 tầng, đang thi công sắp bàn giao."
    response = client.post("/nlp/analyze", json={"text": listing})
    assert response.status_code == 200

    found = [entry["tag"] for entry in response.json()["tags"]]
    for expected in ("mới xây", "hoàn thiện cơ bản"):
        assert expected in found