Files
Ho Ngoc Hai ee3ae2e81d feat(ai-services): add Vietnamese NLP pipeline for property description analysis
Implement auto-tagging (amenities, location features, condition/legal),
content quality scoring with moderation integration, and FastAPI endpoints
for single and batch text analysis. Uses underthesea for Vietnamese
tokenization/POS when available, with regex fallback.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-08 22:42:31 +07:00

49 lines
1.8 KiB
Python

from pydantic import BaseModel, Field
class NLPAnalyzeRequest(BaseModel):
    """Input for analysing a single property description."""

    # Required; validation rejects an empty string (min_length=1).
    text: str = Field(
        default=...,
        min_length=1,
        description="Vietnamese property description text",
    )
    # Optional flag; True when the caller omits it.
    include_moderation: bool = Field(
        default=True,
        description="Whether to include moderation quality score",
    )
class PropertyTag(BaseModel):
    """One tag extracted from a description, with the text that produced it."""

    # All four fields are required (no defaults).
    category: str = Field(
        default=...,
        description="Tag category: amenity, location, condition, legal",
    )
    tag: str = Field(
        default=...,
        description="Normalized tag name",
    )
    matched_text: str = Field(
        default=...,
        description="Original text that matched",
    )
    # Validated to lie in the closed interval [0, 1].
    confidence: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Match confidence",
    )
class QualityScore(BaseModel):
    """Content quality scores, each validated to the closed range [0, 1]."""

    # The first four scores are required.
    overall: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Overall content quality 0-1",
    )
    completeness: float = Field(
        default=...,
        ge=0,
        le=1,
        description="How complete the listing info is",
    )
    readability: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Text readability score",
    )
    information_density: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Ratio of useful info to total text",
    )
    # Optional: defaults to None when no moderation score is supplied.
    moderation_score: float | None = Field(
        default=None,
        ge=0,
        le=1,
        description="Moderation risk score (0=safe, 1=risky)",
    )
class NLPAnalyzeResponse(BaseModel):
    """Analysis result for one text: tags, quality scores and text segments."""

    # Every list field defaults to a fresh empty list per instance.
    tags: list[PropertyTag] = Field(
        default_factory=list,
    )
    # The only required field of the response.
    quality: QualityScore
    tokens: list[str] = Field(
        default_factory=list,
        description="Word-segmented tokens",
    )
    sentences: list[str] = Field(
        default_factory=list,
        description="Sentence-split results",
    )
    keyword_phrases: list[str] = Field(
        default_factory=list,
        description="Key noun phrases extracted",
    )
class BatchAnalyzeRequest(BaseModel):
    """Input for analysing several texts in one request."""

    # The list itself must hold between 1 and 50 items.
    # NOTE(review): the individual strings carry no min_length, unlike
    # NLPAnalyzeRequest.text, so empty items pass validation here —
    # confirm that is intended.
    texts: list[str] = Field(
        default=...,
        min_length=1,
        max_length=50,
        description="Batch of texts",
    )
    # Optional flag; True when the caller omits it.
    include_moderation: bool = Field(
        default=True,
    )
class BatchAnalyzeResponse(BaseModel):
    """Container for the per-text results of a batch analysis."""

    # Required list of NLPAnalyzeResponse items.
    results: list[NLPAnalyzeResponse]