Implement auto-tagging (amenities, location features, condition/legal), content quality scoring with moderation integration, and FastAPI endpoints for single and batch text analysis. Uses underthesea for Vietnamese tokenization/POS when available, with regex fallback. Co-Authored-By: Paperclip <noreply@paperclip.ing>
49 lines
1.8 KiB
Python
49 lines
1.8 KiB
Python
from pydantic import BaseModel, Field
|
|
|
|
|
|
class NLPAnalyzeRequest(BaseModel):
    """Input payload for analysing a single piece of text.

    Attributes:
        text: The text to analyse; must contain at least one character.
        include_moderation: Flag requesting the moderation quality score;
            enabled unless the caller turns it off.
    """

    # Required, non-empty string (min_length=1 rejects "").
    text: str = Field(
        default=...,
        min_length=1,
        description="Vietnamese property description text",
    )
    # Optional switch; defaults to True when omitted from the payload.
    include_moderation: bool = Field(
        default=True,
        description="Whether to include moderation quality score",
    )
|
|
|
|
|
|
class PropertyTag(BaseModel):
    """A single tag detected in a text, with the span that produced it.

    Attributes:
        category: Free-form category string; the field description lists
            amenity, location, condition and legal.
        tag: Normalized name of the tag.
        matched_text: The original text fragment that matched.
        confidence: Match confidence, constrained to the range [0, 1].
    """

    # NOTE(review): category is a plain str, so values outside the four
    # listed in the description are not rejected by validation.
    category: str = Field(
        default=...,
        description="Tag category: amenity, location, condition, legal",
    )
    tag: str = Field(default=..., description="Normalized tag name")
    matched_text: str = Field(
        default=...,
        description="Original text that matched",
    )
    # Bounds are inclusive: 0 <= confidence <= 1.
    confidence: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Match confidence",
    )
|
|
|
|
|
|
class QualityScore(BaseModel):
    """Content quality metrics, each validated to lie within [0, 1].

    Attributes:
        overall: Overall content quality.
        completeness: How complete the listing information is.
        readability: Text readability score.
        information_density: Ratio of useful information to total text.
        moderation_score: Moderation risk (0 = safe, 1 = risky); ``None``
            when no value is supplied.
    """

    # The four required scores share the same inclusive 0..1 bounds.
    overall: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Overall content quality 0-1",
    )
    completeness: float = Field(
        default=...,
        ge=0,
        le=1,
        description="How complete the listing info is",
    )
    readability: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Text readability score",
    )
    information_density: float = Field(
        default=...,
        ge=0,
        le=1,
        description="Ratio of useful info to total text",
    )
    # Optional: defaults to None; when given it must also be within 0..1.
    moderation_score: float | None = Field(
        default=None,
        ge=0,
        le=1,
        description="Moderation risk score (0=safe, 1=risky)",
    )
|
|
|
|
|
|
class NLPAnalyzeResponse(BaseModel):
    """Result of analysing one text.

    Attributes:
        tags: Tags detected in the text; empty list by default.
        quality: Quality metrics for the text (required).
        tokens: Word-segmented tokens; empty list by default.
        sentences: Sentence-split results; empty list by default.
        keyword_phrases: Key noun phrases extracted; empty list by default.
    """

    # default_factory=list gives every instance its own fresh empty list.
    tags: list[PropertyTag] = Field(default_factory=list)
    # The only field without a default: callers must always supply it.
    quality: QualityScore
    tokens: list[str] = Field(
        default_factory=list,
        description="Word-segmented tokens",
    )
    sentences: list[str] = Field(
        default_factory=list,
        description="Sentence-split results",
    )
    keyword_phrases: list[str] = Field(
        default_factory=list,
        description="Key noun phrases extracted",
    )
|
|
|
|
|
|
class BatchAnalyzeRequest(BaseModel):
    """Input payload for analysing several texts in one call.

    Attributes:
        texts: Between 1 and 50 texts (bounds apply to the list length).
        include_moderation: Flag requesting the moderation quality score;
            enabled unless the caller turns it off.
    """

    # min_length / max_length constrain the number of items in the list.
    # NOTE(review): individual items are plain str, so an empty string is
    # accepted here even though the single-text request rejects it —
    # confirm whether that asymmetry is intended before tightening.
    texts: list[str] = Field(
        ...,
        min_length=1,
        max_length=50,
        description="Batch of texts",
    )
    # Described identically to the same flag on the single-text request so
    # both request schemas document the option.
    include_moderation: bool = Field(
        True,
        description="Whether to include moderation quality score",
    )
|
|
|
|
|
|
class BatchAnalyzeResponse(BaseModel):
    """Container for the per-text analysis results of a batch call.

    Attributes:
        results: Required list of analysis results.
            NOTE(review): presumably one entry per input text, in request
            order — confirm against the endpoint implementation.
    """

    results: list[NLPAnalyzeResponse] = Field(default=...)
|