feat(search): configure Typesense for Vietnamese diacritic search

Add normalized (ASCII-only) fields to the Typesense schema and indexer so
users can search without diacritics (e.g. "can ho" finds "căn hộ").
Register synonym rules on the listings collection for HCMC district
abbreviations and common property-type aliases. Set num_typos to 2 for
fuzzy matching.

- Add 7 normalized fields (title, description, address, ward, district,
  city, projectName) using Address.normalize() at index time
- Search queries both the original Vietnamese fields and their normalized
  counterparts (wardNormalized is indexed but not included in query_by)
- Upsert 28 Vietnamese synonym rules on collection init
- Normalize user query to ASCII alongside original for dual matching
- Update tests for new fields and synonym upsert behavior

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Ho Ngoc Hai
2026-04-23 00:41:14 +07:00
parent 36a9b00cf1
commit 7a854373b3
6 changed files with 195 additions and 3 deletions

View File

@@ -30,6 +30,15 @@ export interface ListingDocument {
legalStatus: string | null;
amenities: string[];
isFeatured: number; // 1 if featuredUntil > now, 0 otherwise
// Vietnamese diacritic-normalized fields for accent-insensitive search
titleNormalized: string;
descriptionNormalized: string;
addressNormalized: string;
wardNormalized: string;
districtNormalized: string;
cityNormalized: string;
projectNameNormalized: string | null;
}
export interface SearchResult {

View File

@@ -160,5 +160,42 @@ describe('ListingIndexerService', () => {
expect(result!.priceVND).toBe(5000000000);
expect(result!.location).toEqual([10.776, 106.700]);
expect(result!.amenities).toEqual(['parking']);
// Verify normalized fields are populated
expect(result!.titleNormalized).toBe('test');
expect(result!.descriptionNormalized).toBe('desc');
expect(result!.addressNormalized).toBe('123 street');
expect(result!.wardNormalized).toBe('ward 1');
expect(result!.districtNormalized).toBe('district 1');
expect(result!.cityNormalized).toBe('hcmc');
expect(result!.projectNameNormalized).toBeNull();
});
it('normalizes Vietnamese diacritics in indexed fields', async () => {
  // Reuse the shared fixture but swap in accented Vietnamese property text.
  const accentedProperty = {
    ...mockListing.property,
    title: 'Căn hộ cao cấp',
    description: 'Biệt thự đẹp',
    address: '123 Đường Nguyễn Huệ',
    ward: 'Phường Bến Nghé',
    district: 'Quận 1',
    city: 'Hồ Chí Minh',
    projectName: 'Vinhomes Bason',
  };
  mockPrisma.listing.findUnique.mockResolvedValue({
    ...mockListing,
    property: accentedProperty,
  });
  mockPrisma.$queryRaw.mockResolvedValue([{ lat: 10.776, lng: 106.700 }]);

  const doc = await service.fetchListingDocumentById('listing-1');

  // Every *Normalized field must be the lowercase, accent-free form of its source.
  expect(doc!.titleNormalized).toBe('can ho cao cap');
  expect(doc!.descriptionNormalized).toBe('biet thu dep');
  expect(doc!.addressNormalized).toBe('123 duong nguyen hue');
  expect(doc!.wardNormalized).toBe('phuong ben nghe');
  expect(doc!.districtNormalized).toBe('quan 1');
  expect(doc!.cityNormalized).toBe('ho chi minh');
  expect(doc!.projectNameNormalized).toBe('vinhomes bason');
});
});

View File

@@ -31,6 +31,13 @@ function makeDocument(overrides?: Partial<ListingDocument>): ListingDocument {
projectName: null,
legalStatus: null,
amenities: ['parking'],
titleNormalized: 'test apartment',
descriptionNormalized: 'a great place',
addressNormalized: '123 street',
wardNormalized: 'ward 1',
districtNormalized: 'district 1',
cityNormalized: 'hcmc',
projectNameNormalized: null,
...overrides,
};
}
@@ -44,6 +51,7 @@ describe('TypesenseSearchRepository', () => {
retrieve: ReturnType<typeof vi.fn>;
delete: ReturnType<typeof vi.fn>;
documents: ReturnType<typeof vi.fn>;
synonyms: ReturnType<typeof vi.fn>;
};
let documentOps: {
upsert: ReturnType<typeof vi.fn>;
@@ -70,6 +78,7 @@ describe('TypesenseSearchRepository', () => {
retrieve: vi.fn(),
delete: vi.fn().mockResolvedValue({}),
documents: vi.fn().mockReturnValue(documentOps),
synonyms: vi.fn().mockReturnValue({ upsert: vi.fn().mockResolvedValue({}) }),
};
createFn = vi.fn().mockResolvedValue({});
mockClient = {
@@ -193,4 +202,33 @@ describe('TypesenseSearchRepository', () => {
expect(searchCall.filter_by).toContain('location:(10.776, 106.7, 5 km)');
expect(searchCall.sort_by).toContain('location(10.776, 106.7):asc');
});
it('search queries both original and normalized fields', async () => {
  const emptyResponse = { hits: [], found: 0, search_time_ms: 1 };
  documentOps.search.mockResolvedValue(emptyResponse);

  const accentedSearch: SearchParams = { query: 'căn hộ', page: 1, perPage: 20 };
  await repo.search(accentedSearch);

  // First positional argument of the first call is the Typesense search payload.
  const [sent] = documentOps.search.mock.calls[0]!;
  for (const field of ['titleNormalized', 'addressNormalized']) {
    expect(sent.query_by).toContain(field);
  }
  expect(sent.num_typos).toBe('2');
  // The outgoing query must carry the accented text and its ASCII form.
  for (const phrase of ['căn hộ', 'can ho']) {
    expect(sent.q).toContain(phrase);
  }
});
it('ensureCollection upserts Vietnamese synonyms', async () => {
  // Capture synonym upserts with a dedicated spy instead of the default stub.
  const synonymUpsert = vi.fn().mockResolvedValue({});
  collectionOps.retrieve.mockResolvedValue({ name: 'listings' });
  collectionOps.synonyms.mockReturnValue({ upsert: synonymUpsert });

  await repo.ensureCollection();

  expect(synonymUpsert).toHaveBeenCalled();
  // Spot-check one rule: the HCM rule must be registered under id 'hcm'.
  const hcmRule = expect.objectContaining({
    synonyms: expect.arrayContaining(['hcm', 'ho chi minh']),
  });
  expect(synonymUpsert).toHaveBeenCalledWith('hcm', hcmRule);
});
});

View File

@@ -1,6 +1,7 @@
import { Inject, Injectable } from '@nestjs/common';
import { Prisma } from '@prisma/client';
import { LoggerService, PrismaService } from '@modules/shared';
import { Address } from '@modules/listings/domain/value-objects/address.vo';
import {
SEARCH_REPOSITORY,
type ISearchRepository,
@@ -124,6 +125,15 @@ export class ListingIndexerService {
isFeatured: l.featuredUntil && l.featuredUntil > new Date()
? featuredTierWeight(l.featuredPackage as string | null)
: 0,
// Vietnamese diacritic-normalized fields
titleNormalized: Address.normalize(p.title),
descriptionNormalized: Address.normalize(p.description),
addressNormalized: Address.normalize(p.address),
wardNormalized: Address.normalize(p.ward),
districtNormalized: Address.normalize(p.district),
cityNormalized: Address.normalize(p.city),
projectNameNormalized: p.projectName ? Address.normalize(p.projectName) : null,
};
});
}
@@ -176,6 +186,15 @@ export class ListingIndexerService {
isFeatured: listing.featuredUntil && listing.featuredUntil > new Date()
? featuredTierWeight(listing.featuredPackage as string | null)
: 0,
// Vietnamese diacritic-normalized fields
titleNormalized: Address.normalize(p.title),
descriptionNormalized: Address.normalize(p.description),
addressNormalized: Address.normalize(p.address),
wardNormalized: Address.normalize(p.ward),
districtNormalized: Address.normalize(p.district),
cityNormalized: Address.normalize(p.city),
projectNameNormalized: p.projectName ? Address.normalize(p.projectName) : null,
};
}

View File

@@ -1,4 +1,5 @@
import { type ListingDocument } from '../../domain/repositories/search.repository';
import { Address } from '@modules/listings/domain/value-objects/address.vo';
export interface RawListingRow {
listingId: string;
@@ -64,5 +65,14 @@ export function mapRowToListingDocument(row: RawListingRow): ListingDocument {
legalStatus: row.legalStatus ?? null,
amenities: Array.isArray(row.amenities) ? (row.amenities as string[]) : [],
isFeatured: row.featuredUntil && new Date(row.featuredUntil) > new Date() ? 1 : 0,
// Vietnamese diacritic-normalized fields
titleNormalized: Address.normalize(row.title),
descriptionNormalized: Address.normalize(row.description),
addressNormalized: Address.normalize(row.address),
wardNormalized: Address.normalize(row.ward),
districtNormalized: Address.normalize(row.district),
cityNormalized: Address.normalize(row.city),
projectNameNormalized: row.projectName ? Address.normalize(row.projectName) : null,
};
}

View File

@@ -2,6 +2,7 @@ import { Injectable } from '@nestjs/common';
import { type Client as TypesenseClient } from 'typesense';
import { type CollectionCreateSchema } from 'typesense/lib/Typesense/Collections';
import { LoggerService } from '@modules/shared';
import { Address } from '@modules/listings/domain/value-objects/address.vo';
import {
type ISearchRepository,
type ListingDocument,
@@ -12,6 +13,41 @@ import { TypesenseClientService } from './typesense-client.service';
/** Name of the Typesense collection that holds listing documents. */
const COLLECTION_NAME = 'listings';
/**
 * Multi-way synonym rules for Vietnamese listing search.
 *
 * Three groups of rules live here:
 *  - Ho Chi Minh City districts: numbered ("q1", "quan 1", "quận 1", "q.1")
 *    and named, each with its accent-free and accented spelling;
 *  - property-type aliases (e.g. "can ho" / "chung cu");
 *  - city aliases for Ho Chi Minh City.
 *
 * Every entry is upserted under its `id`, and all terms inside one entry are
 * treated as interchangeable, so a short alias must be unambiguous.
 */
const VIETNAMESE_SYNONYMS: Array<{ id: string; synonyms: string[] }> = [
  // --- Numbered districts ---
  { id: 'q1', synonyms: ['q1', 'quan 1', 'quận 1', 'q.1'] },
  // NOTE(review): q2 is linked to Thủ Đức but q9 is not — confirm the asymmetry is intended.
  { id: 'q2', synonyms: ['q2', 'quan 2', 'quận 2', 'q.2', 'thu duc', 'thủ đức'] },
  { id: 'q3', synonyms: ['q3', 'quan 3', 'quận 3', 'q.3'] },
  { id: 'q4', synonyms: ['q4', 'quan 4', 'quận 4', 'q.4'] },
  { id: 'q5', synonyms: ['q5', 'quan 5', 'quận 5', 'q.5'] },
  { id: 'q6', synonyms: ['q6', 'quan 6', 'quận 6', 'q.6'] },
  { id: 'q7', synonyms: ['q7', 'quan 7', 'quận 7', 'q.7'] },
  { id: 'q8', synonyms: ['q8', 'quan 8', 'quận 8', 'q.8'] },
  { id: 'q9', synonyms: ['q9', 'quan 9', 'quận 9', 'q.9'] },
  { id: 'q10', synonyms: ['q10', 'quan 10', 'quận 10', 'q.10'] },
  { id: 'q11', synonyms: ['q11', 'quan 11', 'quận 11', 'q.11'] },
  { id: 'q12', synonyms: ['q12', 'quan 12', 'quận 12', 'q.12'] },

  // --- Named districts ---
  // NOTE(review): 'bt' and 'pn' may also be read as "biệt thự" / "phòng ngủ"
  // in listing text — confirm these two-letter aliases do not over-match.
  { id: 'binh-thanh', synonyms: ['binh thanh', 'bình thạnh', 'bt'] },
  { id: 'tan-binh', synonyms: ['tan binh', 'tân bình', 'tb'] },
  // No bare 'tp' alias here: 'tp' is the city prefix used in 'tp hcm' below,
  // so equating it with Tân Phú would pull that district into city-wide queries.
  { id: 'tan-phu', synonyms: ['tan phu', 'tân phú'] },
  { id: 'phu-nhuan', synonyms: ['phu nhuan', 'phú nhuận', 'pn'] },
  { id: 'go-vap', synonyms: ['go vap', 'gò vấp', 'gv'] },
  { id: 'binh-tan', synonyms: ['binh tan', 'bình tân'] },
  { id: 'nha-be', synonyms: ['nha be', 'nhà bè'] },
  { id: 'can-gio', synonyms: ['can gio', 'cần giờ'] },
  { id: 'cu-chi', synonyms: ['cu chi', 'củ chi'] },
  { id: 'hoc-mon', synonyms: ['hoc mon', 'hóc môn'] },
  { id: 'binh-chanh', synonyms: ['binh chanh', 'bình chánh'] },

  // --- Property types ---
  { id: 'can-ho', synonyms: ['can ho', 'căn hộ', 'chung cu', 'chung cư'] },
  { id: 'nha-pho', synonyms: ['nha pho', 'nhà phố'] },
  { id: 'biet-thu', synonyms: ['biet thu', 'biệt thự'] },
  { id: 'dat-nen', synonyms: ['dat nen', 'đất nền'] },

  // --- City ---
  { id: 'hcm', synonyms: ['hcm', 'ho chi minh', 'hồ chí minh', 'tp hcm', 'tphcm', 'sai gon', 'sài gòn'] },
];
const LISTING_SCHEMA: CollectionCreateSchema = {
name: COLLECTION_NAME,
fields: [
@@ -43,6 +79,15 @@ const LISTING_SCHEMA: CollectionCreateSchema = {
{ name: 'legalStatus', type: 'string', facet: true, optional: true },
{ name: 'amenities', type: 'string[]', facet: true, optional: true },
{ name: 'isFeatured', type: 'int32', facet: true },
// Vietnamese diacritic-normalized fields (ASCII-only, for accent-insensitive search)
{ name: 'titleNormalized', type: 'string', facet: false },
{ name: 'descriptionNormalized', type: 'string', facet: false },
{ name: 'addressNormalized', type: 'string', facet: false },
{ name: 'wardNormalized', type: 'string', facet: false },
{ name: 'districtNormalized', type: 'string', facet: false },
{ name: 'cityNormalized', type: 'string', facet: false },
{ name: 'projectNameNormalized', type: 'string', facet: false, optional: true },
],
token_separators: ['-', '_'],
enable_nested_fields: false,
@@ -67,6 +112,31 @@ export class TypesenseSearchRepository implements ISearchRepository {
await this.client.collections().create(LISTING_SCHEMA);
this.logger.log(`Collection "${COLLECTION_NAME}" created`, 'TypesenseSearch');
}
await this.ensureSynonyms();
}
/**
 * Upsert the Vietnamese district/property-type synonym rules into the
 * listings collection.
 *
 * Best-effort: failures are logged as a warning and never thrown, so a
 * synonym problem cannot abort the caller. Every rule is attempted
 * independently, so one failing upsert no longer prevents the remaining
 * rules from being applied. Safe to call on every startup.
 */
async ensureSynonyms(): Promise<void> {
  // The callback is `async` so that a synchronous throw from the client is
  // turned into a rejection that allSettled records instead of escaping.
  const outcomes = await Promise.allSettled(
    VIETNAMESE_SYNONYMS.map(async (rule) =>
      this.client
        .collections(COLLECTION_NAME)
        .synonyms()
        .upsert(rule.id, { synonyms: rule.synonyms }),
    ),
  );

  // Collect "<rule id>: <error message>" for every rule that was rejected.
  const failures: string[] = [];
  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'rejected') {
      const reason: unknown = outcome.reason;
      const message = reason instanceof Error ? reason.message : String(reason);
      failures.push(`${VIETNAMESE_SYNONYMS[index]?.id ?? String(index)}: ${message}`);
    }
  });

  if (failures.length === 0) {
    this.logger.log(
      `Upserted ${VIETNAMESE_SYNONYMS.length} Vietnamese synonym rules`,
      'TypesenseSearch',
    );
    return;
  }

  this.logger.warn(
    `Failed to upsert synonyms (${failures.length} of ${outcomes.length} rules): ${failures.join('; ')}`,
    'TypesenseSearch',
  );
}
async dropCollection(): Promise<void> {
@@ -121,14 +191,23 @@ export class TypesenseSearchRepository implements ISearchRepository {
filterBy = filterBy ? `${filterBy} && ${geoFilter}` : geoFilter;
}
const rawQuery = params.query || '*';
// For non-wildcard queries, also search the normalized (ASCII) form
// so "can ho" matches "căn hộ" via the normalized fields.
const normalizedQuery = rawQuery !== '*' ? Address.normalize(rawQuery) : rawQuery;
const effectiveQuery = rawQuery !== '*' && normalizedQuery !== rawQuery
? `${rawQuery} ${normalizedQuery}`
: rawQuery;
const searchParams = {
q: params.query || '*',
query_by: 'title,description,address,district,city,projectName',
query_by_weights: '5,3,2,2,1,2',
q: effectiveQuery,
query_by: 'title,description,address,district,city,projectName,titleNormalized,descriptionNormalized,addressNormalized,districtNormalized,cityNormalized,projectNameNormalized',
query_by_weights: '5,3,2,2,1,2,5,3,2,2,1,2',
filter_by: filterBy,
sort_by: this.buildSortBy(params),
page,
per_page: perPage,
num_typos: '2',
highlight_full_fields: 'title,description',
highlight_start_tag: '<mark>',
highlight_end_tag: '</mark>',