feat(search): configure Typesense for Vietnamese diacritic search
Add normalized (ASCII-only) fields to the Typesense schema and indexer so users can search without diacritics (e.g. "can ho" finds "căn hộ"). Create a synonym collection for HCMC district abbreviations and common property-type aliases. Enable num_typos: 2 for fuzzy matching.

- Add 7 normalized fields (title, description, address, ward, district, city, projectName) using Address.normalize() at index time
- Search queries both the original Vietnamese and the normalized field sets
- Upsert 28 Vietnamese synonym rules on collection init
- Normalize the user query to ASCII alongside the original for dual matching
- Update tests for the new fields and the synonym upsert behavior

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -30,6 +30,15 @@ export interface ListingDocument {
|
||||
legalStatus: string | null;
|
||||
amenities: string[];
|
||||
isFeatured: number; // 1 if featuredUntil > now, 0 otherwise
|
||||
|
||||
// Vietnamese diacritic-normalized fields for accent-insensitive search
|
||||
titleNormalized: string;
|
||||
descriptionNormalized: string;
|
||||
addressNormalized: string;
|
||||
wardNormalized: string;
|
||||
districtNormalized: string;
|
||||
cityNormalized: string;
|
||||
projectNameNormalized: string | null;
|
||||
}
|
||||
|
||||
export interface SearchResult {
|
||||
|
||||
@@ -160,5 +160,42 @@ describe('ListingIndexerService', () => {
|
||||
expect(result!.priceVND).toBe(5000000000);
|
||||
expect(result!.location).toEqual([10.776, 106.700]);
|
||||
expect(result!.amenities).toEqual(['parking']);
|
||||
|
||||
// Verify normalized fields are populated
|
||||
expect(result!.titleNormalized).toBe('test');
|
||||
expect(result!.descriptionNormalized).toBe('desc');
|
||||
expect(result!.addressNormalized).toBe('123 street');
|
||||
expect(result!.wardNormalized).toBe('ward 1');
|
||||
expect(result!.districtNormalized).toBe('district 1');
|
||||
expect(result!.cityNormalized).toBe('hcmc');
|
||||
expect(result!.projectNameNormalized).toBeNull();
|
||||
});
|
||||
|
||||
it('normalizes Vietnamese diacritics in indexed fields', async () => {
  // Property text carrying Vietnamese diacritics in every searchable field;
  // only projectName is already plain ASCII apart from letter case.
  const accentedProperty = {
    ...mockListing.property,
    title: 'Căn hộ cao cấp',
    description: 'Biệt thự đẹp',
    address: '123 Đường Nguyễn Huệ',
    ward: 'Phường Bến Nghé',
    district: 'Quận 1',
    city: 'Hồ Chí Minh',
    projectName: 'Vinhomes Bason',
  };
  mockPrisma.listing.findUnique.mockResolvedValue({
    ...mockListing,
    property: accentedProperty,
  });
  mockPrisma.$queryRaw.mockResolvedValue([{ lat: 10.776, lng: 106.700 }]);

  const doc = await service.fetchListingDocumentById('listing-1');

  // Each *Normalized field must hold the lowercase, accent-free form.
  const expectedAscii = {
    titleNormalized: 'can ho cao cap',
    descriptionNormalized: 'biet thu dep',
    addressNormalized: '123 duong nguyen hue',
    wardNormalized: 'phuong ben nghe',
    districtNormalized: 'quan 1',
    cityNormalized: 'ho chi minh',
    projectNameNormalized: 'vinhomes bason',
  } as const;
  for (const [field, ascii] of Object.entries(expectedAscii)) {
    expect(doc![field as keyof typeof expectedAscii]).toBe(ascii);
  }
});
|
||||
});
|
||||
|
||||
@@ -31,6 +31,13 @@ function makeDocument(overrides?: Partial<ListingDocument>): ListingDocument {
|
||||
projectName: null,
|
||||
legalStatus: null,
|
||||
amenities: ['parking'],
|
||||
titleNormalized: 'test apartment',
|
||||
descriptionNormalized: 'a great place',
|
||||
addressNormalized: '123 street',
|
||||
wardNormalized: 'ward 1',
|
||||
districtNormalized: 'district 1',
|
||||
cityNormalized: 'hcmc',
|
||||
projectNameNormalized: null,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
@@ -44,6 +51,7 @@ describe('TypesenseSearchRepository', () => {
|
||||
retrieve: ReturnType<typeof vi.fn>;
|
||||
delete: ReturnType<typeof vi.fn>;
|
||||
documents: ReturnType<typeof vi.fn>;
|
||||
synonyms: ReturnType<typeof vi.fn>;
|
||||
};
|
||||
let documentOps: {
|
||||
upsert: ReturnType<typeof vi.fn>;
|
||||
@@ -70,6 +78,7 @@ describe('TypesenseSearchRepository', () => {
|
||||
retrieve: vi.fn(),
|
||||
delete: vi.fn().mockResolvedValue({}),
|
||||
documents: vi.fn().mockReturnValue(documentOps),
|
||||
synonyms: vi.fn().mockReturnValue({ upsert: vi.fn().mockResolvedValue({}) }),
|
||||
};
|
||||
createFn = vi.fn().mockResolvedValue({});
|
||||
mockClient = {
|
||||
@@ -193,4 +202,33 @@ describe('TypesenseSearchRepository', () => {
|
||||
expect(searchCall.filter_by).toContain('location:(10.776, 106.7, 5 km)');
|
||||
expect(searchCall.sort_by).toContain('location(10.776, 106.7):asc');
|
||||
});
|
||||
|
||||
it('search queries both original and normalized fields', async () => {
  documentOps.search.mockResolvedValue({ hits: [], found: 0, search_time_ms: 1 });

  const accentedSearch: SearchParams = { query: 'căn hộ', page: 1, perPage: 20 };
  await repo.search(accentedSearch);

  // Inspect the parameter object handed to the Typesense documents().search() mock.
  const [firstCall] = documentOps.search.mock.calls;
  const sentParams = firstCall![0];

  // The normalized companion fields must be part of query_by.
  for (const normalizedField of ['titleNormalized', 'addressNormalized']) {
    expect(sentParams.query_by).toContain(normalizedField);
  }
  expect(sentParams.num_typos).toBe('2');

  // The query text must carry the original Vietnamese and its ASCII form.
  for (const queryVariant of ['căn hộ', 'can ho']) {
    expect(sentParams.q).toContain(queryVariant);
  }
});
|
||||
|
||||
it('ensureCollection upserts Vietnamese synonyms', async () => {
  // A dedicated spy lets the assertions see exactly what was upserted.
  const synonymUpsert = vi.fn().mockResolvedValue({});
  collectionOps.synonyms.mockReturnValue({ upsert: synonymUpsert });
  // retrieve() resolving means the collection is reported as already existing.
  collectionOps.retrieve.mockResolvedValue({ name: 'listings' });

  await repo.ensureCollection();

  expect(synonymUpsert).toHaveBeenCalled();

  // At minimum the HCM rule must be present, with its core aliases.
  const hcmRule = expect.objectContaining({
    synonyms: expect.arrayContaining(['hcm', 'ho chi minh']),
  });
  expect(synonymUpsert).toHaveBeenCalledWith('hcm', hcmRule);
});
|
||||
});
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { Inject, Injectable } from '@nestjs/common';
|
||||
import { Prisma } from '@prisma/client';
|
||||
import { LoggerService, PrismaService } from '@modules/shared';
|
||||
import { Address } from '@modules/listings/domain/value-objects/address.vo';
|
||||
import {
|
||||
SEARCH_REPOSITORY,
|
||||
type ISearchRepository,
|
||||
@@ -124,6 +125,15 @@ export class ListingIndexerService {
|
||||
isFeatured: l.featuredUntil && l.featuredUntil > new Date()
|
||||
? featuredTierWeight(l.featuredPackage as string | null)
|
||||
: 0,
|
||||
|
||||
// Vietnamese diacritic-normalized fields
|
||||
titleNormalized: Address.normalize(p.title),
|
||||
descriptionNormalized: Address.normalize(p.description),
|
||||
addressNormalized: Address.normalize(p.address),
|
||||
wardNormalized: Address.normalize(p.ward),
|
||||
districtNormalized: Address.normalize(p.district),
|
||||
cityNormalized: Address.normalize(p.city),
|
||||
projectNameNormalized: p.projectName ? Address.normalize(p.projectName) : null,
|
||||
};
|
||||
});
|
||||
}
|
||||
@@ -176,6 +186,15 @@ export class ListingIndexerService {
|
||||
isFeatured: listing.featuredUntil && listing.featuredUntil > new Date()
|
||||
? featuredTierWeight(listing.featuredPackage as string | null)
|
||||
: 0,
|
||||
|
||||
// Vietnamese diacritic-normalized fields
|
||||
titleNormalized: Address.normalize(p.title),
|
||||
descriptionNormalized: Address.normalize(p.description),
|
||||
addressNormalized: Address.normalize(p.address),
|
||||
wardNormalized: Address.normalize(p.ward),
|
||||
districtNormalized: Address.normalize(p.district),
|
||||
cityNormalized: Address.normalize(p.city),
|
||||
projectNameNormalized: p.projectName ? Address.normalize(p.projectName) : null,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { type ListingDocument } from '../../domain/repositories/search.repository';
|
||||
import { Address } from '@modules/listings/domain/value-objects/address.vo';
|
||||
|
||||
export interface RawListingRow {
|
||||
listingId: string;
|
||||
@@ -64,5 +65,14 @@ export function mapRowToListingDocument(row: RawListingRow): ListingDocument {
|
||||
legalStatus: row.legalStatus ?? null,
|
||||
amenities: Array.isArray(row.amenities) ? (row.amenities as string[]) : [],
|
||||
isFeatured: row.featuredUntil && new Date(row.featuredUntil) > new Date() ? 1 : 0,
|
||||
|
||||
// Vietnamese diacritic-normalized fields
|
||||
titleNormalized: Address.normalize(row.title),
|
||||
descriptionNormalized: Address.normalize(row.description),
|
||||
addressNormalized: Address.normalize(row.address),
|
||||
wardNormalized: Address.normalize(row.ward),
|
||||
districtNormalized: Address.normalize(row.district),
|
||||
cityNormalized: Address.normalize(row.city),
|
||||
projectNameNormalized: row.projectName ? Address.normalize(row.projectName) : null,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ import { Injectable } from '@nestjs/common';
|
||||
import { type Client as TypesenseClient } from 'typesense';
|
||||
import { type CollectionCreateSchema } from 'typesense/lib/Typesense/Collections';
|
||||
import { LoggerService } from '@modules/shared';
|
||||
import { Address } from '@modules/listings/domain/value-objects/address.vo';
|
||||
import {
|
||||
type ISearchRepository,
|
||||
type ListingDocument,
|
||||
@@ -12,6 +13,41 @@ import { TypesenseClientService } from './typesense-client.service';
|
||||
|
||||
const COLLECTION_NAME = 'listings';
|
||||
|
||||
/**
 * Synonym rules upserted into the listings collection.
 *
 * Each entry is one rule: `id` is the key the rule is upserted under and
 * `synonyms` lists terms that should be treated as equivalent. Every term is
 * given both with diacritics and in plain ASCII so either spelling matches.
 *
 * The list covers, in order:
 *  - numbered HCMC districts ("q1", "quan 1", "quận 1", "q.1", ...),
 *  - named HCMC districts, some with a two-letter abbreviation,
 *  - property types ("can ho" / "chung cu", "nha pho", ...),
 *  - city aliases for Ho Chi Minh City.
 *
 * A single-word alias must never be a word of a phrase in another rule,
 * otherwise a query for that phrase is expanded to the unrelated rule as
 * well. That is why Tân Phú carries no "tp" abbreviation: "tp" is the first
 * word of "tp hcm" in the city rule.
 *
 * NOTE(review): "bt", "tb", "pn" and "gv" are short enough to collide with
 * other real-estate shorthand — confirm against real query logs.
 * NOTE(review): "q2" is the only numbered district tied to "thu duc" —
 * confirm that equivalence is intended.
 */
const VIETNAMESE_SYNONYMS: Array<{ id: string; synonyms: string[] }> = [
  // Numbered districts
  { id: 'q1', synonyms: ['q1', 'quan 1', 'quận 1', 'q.1'] },
  { id: 'q2', synonyms: ['q2', 'quan 2', 'quận 2', 'q.2', 'thu duc', 'thủ đức'] },
  { id: 'q3', synonyms: ['q3', 'quan 3', 'quận 3', 'q.3'] },
  { id: 'q4', synonyms: ['q4', 'quan 4', 'quận 4', 'q.4'] },
  { id: 'q5', synonyms: ['q5', 'quan 5', 'quận 5', 'q.5'] },
  { id: 'q6', synonyms: ['q6', 'quan 6', 'quận 6', 'q.6'] },
  { id: 'q7', synonyms: ['q7', 'quan 7', 'quận 7', 'q.7'] },
  { id: 'q8', synonyms: ['q8', 'quan 8', 'quận 8', 'q.8'] },
  { id: 'q9', synonyms: ['q9', 'quan 9', 'quận 9', 'q.9'] },
  { id: 'q10', synonyms: ['q10', 'quan 10', 'quận 10', 'q.10'] },
  { id: 'q11', synonyms: ['q11', 'quan 11', 'quận 11', 'q.11'] },
  { id: 'q12', synonyms: ['q12', 'quan 12', 'quận 12', 'q.12'] },

  // Named districts
  { id: 'binh-thanh', synonyms: ['binh thanh', 'bình thạnh', 'bt'] },
  { id: 'tan-binh', synonyms: ['tan binh', 'tân bình', 'tb'] },
  // No "tp" alias here: it would hijack "tp hcm" queries (see header comment).
  { id: 'tan-phu', synonyms: ['tan phu', 'tân phú'] },
  { id: 'phu-nhuan', synonyms: ['phu nhuan', 'phú nhuận', 'pn'] },
  { id: 'go-vap', synonyms: ['go vap', 'gò vấp', 'gv'] },
  { id: 'binh-tan', synonyms: ['binh tan', 'bình tân'] },
  { id: 'nha-be', synonyms: ['nha be', 'nhà bè'] },
  { id: 'can-gio', synonyms: ['can gio', 'cần giờ'] },
  { id: 'cu-chi', synonyms: ['cu chi', 'củ chi'] },
  { id: 'hoc-mon', synonyms: ['hoc mon', 'hóc môn'] },
  { id: 'binh-chanh', synonyms: ['binh chanh', 'bình chánh'] },

  // Property types
  { id: 'can-ho', synonyms: ['can ho', 'căn hộ', 'chung cu', 'chung cư'] },
  { id: 'nha-pho', synonyms: ['nha pho', 'nhà phố'] },
  { id: 'biet-thu', synonyms: ['biet thu', 'biệt thự'] },
  { id: 'dat-nen', synonyms: ['dat nen', 'đất nền'] },

  // City aliases
  { id: 'hcm', synonyms: ['hcm', 'ho chi minh', 'hồ chí minh', 'tp hcm', 'tphcm', 'sai gon', 'sài gòn'] },
];
|
||||
|
||||
const LISTING_SCHEMA: CollectionCreateSchema = {
|
||||
name: COLLECTION_NAME,
|
||||
fields: [
|
||||
@@ -43,6 +79,15 @@ const LISTING_SCHEMA: CollectionCreateSchema = {
|
||||
{ name: 'legalStatus', type: 'string', facet: true, optional: true },
|
||||
{ name: 'amenities', type: 'string[]', facet: true, optional: true },
|
||||
{ name: 'isFeatured', type: 'int32', facet: true },
|
||||
|
||||
// Vietnamese diacritic-normalized fields (ASCII-only, for accent-insensitive search)
|
||||
{ name: 'titleNormalized', type: 'string', facet: false },
|
||||
{ name: 'descriptionNormalized', type: 'string', facet: false },
|
||||
{ name: 'addressNormalized', type: 'string', facet: false },
|
||||
{ name: 'wardNormalized', type: 'string', facet: false },
|
||||
{ name: 'districtNormalized', type: 'string', facet: false },
|
||||
{ name: 'cityNormalized', type: 'string', facet: false },
|
||||
{ name: 'projectNameNormalized', type: 'string', facet: false, optional: true },
|
||||
],
|
||||
token_separators: ['-', '_'],
|
||||
enable_nested_fields: false,
|
||||
@@ -67,6 +112,31 @@ export class TypesenseSearchRepository implements ISearchRepository {
|
||||
await this.client.collections().create(LISTING_SCHEMA);
|
||||
this.logger.log(`Collection "${COLLECTION_NAME}" created`, 'TypesenseSearch');
|
||||
}
|
||||
await this.ensureSynonyms();
|
||||
}
|
||||
|
||||
/**
 * Upsert every rule in VIETNAMESE_SYNONYMS into the listings collection.
 *
 * Rules are upserted by id, one request at a time, so calling this again
 * with the same rules overwrites them rather than duplicating them.
 *
 * Best-effort: this method never throws. A failure on one rule is recorded
 * and the remaining rules are still attempted, so a single rejected rule
 * cannot leave the collection without the rest of its synonyms. The outcome
 * is reported once at the end — a log line when everything succeeded, a
 * warning listing the failed rule ids otherwise.
 */
async ensureSynonyms(): Promise<void> {
  const failures: string[] = [];

  for (const syn of VIETNAMESE_SYNONYMS) {
    try {
      await this.client
        .collections(COLLECTION_NAME)
        .synonyms()
        .upsert(syn.id, { synonyms: syn.synonyms });
    } catch (err) {
      // Keep going: one bad rule must not block the remaining ones.
      failures.push(`${syn.id} (${err instanceof Error ? err.message : String(err)})`);
    }
  }

  if (failures.length === 0) {
    this.logger.log(
      `Upserted ${VIETNAMESE_SYNONYMS.length} Vietnamese synonym rules`,
      'TypesenseSearch',
    );
    return;
  }

  this.logger.warn(
    `Failed to upsert synonyms: ${failures.length} of ${VIETNAMESE_SYNONYMS.length} rules failed — ${failures.join('; ')}`,
    'TypesenseSearch',
  );
}
|
||||
|
||||
async dropCollection(): Promise<void> {
|
||||
@@ -121,14 +191,23 @@ export class TypesenseSearchRepository implements ISearchRepository {
|
||||
filterBy = filterBy ? `${filterBy} && ${geoFilter}` : geoFilter;
|
||||
}
|
||||
|
||||
const rawQuery = params.query || '*';
|
||||
// For non-wildcard queries, also search the normalized (ASCII) form
|
||||
// so "can ho" matches "căn hộ" via the normalized fields.
|
||||
const normalizedQuery = rawQuery !== '*' ? Address.normalize(rawQuery) : rawQuery;
|
||||
const effectiveQuery = rawQuery !== '*' && normalizedQuery !== rawQuery
|
||||
? `${rawQuery} ${normalizedQuery}`
|
||||
: rawQuery;
|
||||
|
||||
const searchParams = {
|
||||
q: params.query || '*',
|
||||
query_by: 'title,description,address,district,city,projectName',
|
||||
query_by_weights: '5,3,2,2,1,2',
|
||||
q: effectiveQuery,
|
||||
query_by: 'title,description,address,district,city,projectName,titleNormalized,descriptionNormalized,addressNormalized,districtNormalized,cityNormalized,projectNameNormalized',
|
||||
query_by_weights: '5,3,2,2,1,2,5,3,2,2,1,2',
|
||||
filter_by: filterBy,
|
||||
sort_by: this.buildSortBy(params),
|
||||
page,
|
||||
per_page: perPage,
|
||||
num_typos: '2',
|
||||
highlight_full_fields: 'title,description',
|
||||
highlight_start_tag: '<mark>',
|
||||
highlight_end_tag: '</mark>',
|
||||
|
||||
Reference in New Issue
Block a user