feat(ai-services): add Vietnamese NLP pipeline for property description analysis
Implement auto-tagging (amenities, location features, condition/legal), content quality scoring with moderation integration, and FastAPI endpoints for single and batch text analysis. Uses underthesea for Vietnamese tokenization and POS tagging when available, with a regex fallback.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
48
libs/ai-services/app/models/nlp.py
Normal file
48
libs/ai-services/app/models/nlp.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Request body for analysing a single property description.
#
# Fields:
#   text               -- required, must contain at least one character.
#   include_moderation -- optional flag, defaults to True.
class NLPAnalyzeRequest(BaseModel):
    text: str = Field(
        default=...,
        min_length=1,
        description="Vietnamese property description text",
    )
    include_moderation: bool = Field(
        default=True,
        description="Whether to include moderation quality score",
    )
|
||||
|
||||
|
||||
# One tag extracted from a description.
#
# All four fields are required. `confidence` is validated to lie in the
# closed interval [0, 1]. `category` is a free-form string here; the values
# listed in its description are not enforced by this model.
class PropertyTag(BaseModel):
    category: str = Field(
        default=...,
        description="Tag category: amenity, location, condition, legal",
    )
    tag: str = Field(default=..., description="Normalized tag name")
    matched_text: str = Field(default=..., description="Original text that matched")
    confidence: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Match confidence",
    )
|
||||
|
||||
|
||||
# Content quality metrics for one description.
#
# Every score is validated to lie in the closed interval [0, 1].
# `overall`, `completeness`, `readability` and `information_density` are
# required. `moderation_score` is optional and defaults to None.
# NOTE(review): presumably None means moderation was not requested or not
# available -- confirm against the code that builds this model.
class QualityScore(BaseModel):
    overall: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Overall content quality 0-1",
    )
    completeness: float = Field(
        default=...,
        ge=0,
        le=1,
        description="How complete the listing info is",
    )
    readability: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Text readability score",
    )
    information_density: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Ratio of useful info to total text",
    )
    moderation_score: float | None = Field(
        default=None,
        ge=0,
        le=1,
        description="Moderation risk score (0=safe, 1=risky)",
    )
|
||||
|
||||
|
||||
# Analysis result for a single description.
#
# `quality` is the only required field. Every list field falls back to a
# fresh empty list (via default_factory) when it is not supplied.
class NLPAnalyzeResponse(BaseModel):
    tags: list[PropertyTag] = Field(default_factory=list)
    quality: QualityScore
    tokens: list[str] = Field(
        description="Word-segmented tokens",
        default_factory=list,
    )
    sentences: list[str] = Field(
        description="Sentence-split results",
        default_factory=list,
    )
    keyword_phrases: list[str] = Field(
        description="Key noun phrases extracted",
        default_factory=list,
    )
|
||||
|
||||
|
||||
# Request body for analysing several descriptions in one call.
#
# `texts` is required and must hold between 1 and 50 items (the length
# bounds apply to the list itself). `include_moderation` defaults to True.
# NOTE(review): the individual strings carry no length constraint, so an
# empty string is accepted here although NLPAnalyzeRequest.text requires
# min_length=1 -- confirm whether that difference is intentional.
class BatchAnalyzeRequest(BaseModel):
    texts: list[str] = Field(
        default=...,
        min_length=1,
        max_length=50,
        description="Batch of texts",
    )
    include_moderation: bool = Field(default=True)
|
||||
|
||||
|
||||
# Response body for a batch analysis call.
#
# `results` is required.
# NOTE(review): presumably one entry per input text, in request order --
# confirm against the batch endpoint.
class BatchAnalyzeResponse(BaseModel):
    results: list[NLPAnalyzeResponse] = Field(default=...)
|
||||
Reference in New Issue
Block a user