Files
goodgo-platform/scripts/sync-osm-admin-boundaries.ts
2026-05-04 17:27:08 +07:00

520 lines
18 KiB
TypeScript

/**
* Sync Vietnam administrative boundaries from OpenStreetMap into the
* `vn_provinces` / `vn_districts` / `vn_wards` tables.
*
* Usage:
* NODE_OPTIONS="-r dotenv/config" DOTENV_CONFIG_PATH=.env \
* pnpm tsx scripts/sync-osm-admin-boundaries.ts \
* [--level=4|6|8|all] [--dry-run] [--chunk=NAME]
*
* What it does:
* 1. Queries Overpass for `boundary=administrative + admin_level=N`
* relations clipped to the Vietnam bbox (split into 4 chunks).
* 2. Converts each relation's outer rings into a MultiPolygon GeoJSON.
* 3. Looks up the GSO code from OSM tags (`ref:VN`, `gso_code`,
* `iso_code`, fallback to slugified name → existing seed row).
* 4. Upserts the row, writing geometry + centroid + areaKm2 + osmId.
*
* Coverage targets:
* admin_level=4 → 63 provinces (cities of central authority + 58 tỉnh)
* admin_level=6 → ~700 districts (quận / huyện / thị xã / TP thuộc tỉnh)
* admin_level=8 → ~11.000 wards (phường / xã / thị trấn)
*
* Notes:
* • Vietnam reformed wards in 2025 (some merged). We track historic
* names via `vn_administrative_aliases` — this script populates that
* table when an OSM tag `was:name` differs from the current name.
* • Wards (level 8) are the heaviest pull (~11k polygons). We always
* chunk them into 4 geographic slices to dodge Overpass timeouts.
*/
import 'dotenv/config';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import area from '@turf/area';
import centroid from '@turf/centroid';
import type { Feature, MultiPolygon, Polygon } from 'geojson';
import osmtogeojson from 'osmtogeojson';
import pg from 'pg';
import { isPointInVietnam } from './data/vn-country-polygon';
const pool = new pg.Pool({ connectionString: process.env['DATABASE_URL'] });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });
const OVERPASS_URL =
process.env['OVERPASS_URL'] ?? 'https://overpass-api.de/api/interpreter';
interface BBox {
south: number;
west: number;
north: number;
east: number;
}
/** Same chunks the KCN sync uses — keeps Overpass query budget reasonable. */
const CHUNKS: Record<string, BBox> = {
north: { south: 19.0, west: 102.0, north: 23.5, east: 110.0 },
northCentral: { south: 16.5, west: 102.0, north: 19.0, east: 110.0 },
southCentral: { south: 13.0, west: 102.0, north: 16.5, east: 110.0 },
south: { south: 8.0, west: 102.0, north: 13.0, east: 110.0 },
};
// ─── CLI ──────────────────────────────────────────────────────────────────
const argv = process.argv.slice(2);
const dryRun = argv.includes('--dry-run');
const chunkArg = argv.find((a) => a.startsWith('--chunk='))?.slice('--chunk='.length);
const levelArg = argv.find((a) => a.startsWith('--level='))?.slice('--level='.length) ?? 'all';
const wantedLevels: number[] =
levelArg === 'all'
? [4, 6, 8]
: levelArg
.split(',')
.map((s) => Number(s.trim()))
.filter((n) => [4, 6, 8].includes(n));
// ─── Slug helper (matches GSO codename style) ──────────────────────────────
function slugify(name: string): string {
return name
.toLowerCase()
.replace(/đ/g, 'd')
.normalize('NFD')
.replace(/[̀-ͯ]/g, '')
.replace(/[^a-z0-9]+/g, '_')
.replace(/^_+|_+$/g, '');
}
// ─── Overpass fetch ────────────────────────────────────────────────────────
interface OverpassResult {
elements: unknown[];
}
async function fetchChunk(level: number, name: string, bbox: BBox): Promise<OverpassResult> {
// `out geom` returns the relation members with inline geometry so we can
// assemble polygons without a second roundtrip. Timeout 300s for level=8.
const query = `
[out:json][timeout:300];
relation
["boundary"="administrative"]
["admin_level"="${level}"]
(${bbox.south},${bbox.west},${bbox.north},${bbox.east});
out body geom;
`;
console.log(` → fetching level=${level} chunk="${name}"…`);
const start = Date.now();
const res = await fetch(OVERPASS_URL, {
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'goodgo-osm-admin-sync/1.0 (https://goodgo.vn)',
},
body: 'data=' + encodeURIComponent(query),
});
if (!res.ok) {
const body = await res.text();
throw new Error(`Overpass returned ${res.status}: ${body.slice(0, 200)}`);
}
const json = (await res.json()) as OverpassResult;
console.log(
` ← level=${level} ${name}: ${json.elements?.length ?? 0} relations in ${(
(Date.now() - start) /
1000
).toFixed(1)}s`,
);
return json;
}
// ─── Per-feature parser ────────────────────────────────────────────────────
interface ParsedAdmin {
level: 4 | 6 | 8;
osmId: bigint;
name: string;
nameEn: string | null;
gsoCode: string | null;
type: string; // "Tỉnh" / "Quận" / "Phường" etc.
geometry: MultiPolygon; // outer rings only
centroid: { lng: number; lat: number };
areaKm2: number;
population: number | null;
rawTags: Record<string, string>;
}
const PROVINCE_TYPE_MAP = (name: string): string =>
/^(Thành phố|TP\.?)\s+(Hà Nội|Hồ Chí Minh|Hải Phòng|Đà Nẵng|Cần Thơ)/i.test(name)
? 'Thành phố Trung ương'
: 'Tỉnh';
const DISTRICT_TYPE_MAP = (name: string): string => {
if (/^Quận/i.test(name)) return 'Quận';
if (/^Huyện/i.test(name)) return 'Huyện';
if (/^Thị xã/i.test(name)) return 'Thị xã';
if (/^Thành phố/i.test(name)) return 'Thành phố thuộc tỉnh';
return 'Quận';
};
const WARD_TYPE_MAP = (name: string): string => {
if (/^Phường/i.test(name)) return 'Phường';
if (/^Xã/i.test(name)) return 'Xã';
if (/^Thị trấn/i.test(name)) return 'Thị trấn';
return 'Xã';
};
function parseFeature(
feat: Feature<Polygon | MultiPolygon>,
level: 4 | 6 | 8,
): ParsedAdmin | null {
const propsRaw = feat.properties as Record<string, unknown> | null;
if (!propsRaw) return null;
// osmtogeojson encodes the prefixed id on `feat.id` ("relation/123") and
// the bare numeric id under `properties.id`. We only kept relations.
const featAny = feat as unknown as { id?: unknown };
const idStr = String(featAny.id ?? propsRaw['id'] ?? '');
if (!idStr.startsWith('relation/')) return null;
const osmId = BigInt(idStr.slice('relation/'.length));
const tagsRaw = propsRaw['tags'];
const tags: Record<string, string> =
tagsRaw && typeof tagsRaw === 'object'
? (tagsRaw as Record<string, string>)
: (propsRaw as Record<string, string>);
const name = tags['name:vi'] ?? tags['name'] ?? null;
if (!name) return null;
// Skip rows without any Latin/Vietnamese letter (cross-border bleed).
if (!/[A-Za-zÀ-ỹ]/.test(name)) return null;
const nameEn = tags['name:en'] ?? null;
const gsoCode =
tags['ref:VN'] ?? tags['gso_code'] ?? tags['ref'] ?? tags['iso_code'] ?? null;
const populationRaw = tags['population'];
const population = populationRaw && /^\d+$/.test(populationRaw) ? Number(populationRaw) : null;
// Normalise to MultiPolygon regardless of source (Polygon → wrap once).
const geom: MultiPolygon =
feat.geometry.type === 'Polygon'
? { type: 'MultiPolygon', coordinates: [feat.geometry.coordinates] }
: feat.geometry;
const c = centroid(feat as Feature);
const [cLng, cLat] = c.geometry.coordinates;
// Geographic gate: drop relations whose centroid sits outside the VN
// mainland polygon (China / Laos / Cambodia bleed across the bbox).
if (!isPointInVietnam(cLng, cLat)) return null;
const areaKm2 = Math.round((area(feat as Feature) / 1_000_000) * 100) / 100;
let type: string;
if (level === 4) type = PROVINCE_TYPE_MAP(name);
else if (level === 6) type = DISTRICT_TYPE_MAP(name);
else type = WARD_TYPE_MAP(name);
return {
level,
osmId,
name,
nameEn,
gsoCode,
type,
geometry: geom,
centroid: { lng: cLng, lat: cLat },
areaKm2,
population,
rawTags: tags,
};
}
// ─── Resolve to existing GSO code or generate a synthetic one ─────────────
async function resolveProvinceCode(p: ParsedAdmin): Promise<string> {
if (p.gsoCode) {
const exists = await prisma.vnProvince.findUnique({ where: { code: p.gsoCode } });
if (exists) return p.gsoCode;
}
// Fallback: lookup by codename slug.
const codename = slugify(p.name);
const byCodename = await prisma.vnProvince.findFirst({ where: { codename } });
if (byCodename) return byCodename.code;
// Brand-new: derive a code from osmId so it's stable.
return `OSM_${p.osmId.toString()}`;
}
async function resolveDistrictCode(p: ParsedAdmin, provinceCode: string): Promise<string> {
if (p.gsoCode) {
const exists = await prisma.vnDistrict.findUnique({ where: { code: p.gsoCode } });
if (exists) return p.gsoCode;
}
const codename = slugify(p.name);
const byCodename = await prisma.vnDistrict.findFirst({
where: { codename, provinceCode },
});
if (byCodename) return byCodename.code;
return `OSM_${p.osmId.toString()}`;
}
async function resolveWardCode(p: ParsedAdmin, districtCode: string): Promise<string> {
if (p.gsoCode) {
const exists = await prisma.vnWard.findUnique({ where: { code: p.gsoCode } });
if (exists) return p.gsoCode;
}
const codename = slugify(p.name);
const byCodename = await prisma.vnWard.findFirst({
where: { codename, districtCode },
});
if (byCodename) return byCodename.code;
return `OSM_${p.osmId.toString()}`;
}
// ─── Upsert helpers — raw SQL because Prisma can't manage geometry ────────
function geomSql(g: MultiPolygon): string {
const json = JSON.stringify(g).replace(/'/g, "''");
return `ST_Multi(ST_GeomFromGeoJSON('${json}'))`;
}
interface UpsertStats {
inserted: number;
updated: number;
skipped: number;
}
async function upsertProvince(p: ParsedAdmin, stats: UpsertStats): Promise<void> {
const code = await resolveProvinceCode(p);
const codename = slugify(p.name);
const existed = await prisma.vnProvince.findUnique({ where: { code }, select: { code: true } });
await prisma.$executeRawUnsafe(
`
INSERT INTO "vn_provinces" (
code, name, "nameEn", type, codename, "osmId",
"areaKm2", population, "lastSyncedAt", "updatedAt", geometry, centroid
) VALUES (
$1, $2, $3, $4, $5, $6::bigint,
$7, $8, NOW(), NOW(),
${geomSql(p.geometry)},
ST_SetSRID(ST_MakePoint($9, $10), 4326)
)
ON CONFLICT (code) DO UPDATE SET
name = EXCLUDED.name,
"nameEn" = EXCLUDED."nameEn",
type = EXCLUDED.type,
"osmId" = EXCLUDED."osmId",
"areaKm2" = EXCLUDED."areaKm2",
population = COALESCE(EXCLUDED.population, "vn_provinces".population),
"lastSyncedAt" = NOW(),
"updatedAt" = NOW(),
geometry = EXCLUDED.geometry,
centroid = EXCLUDED.centroid
`,
code,
p.name,
p.nameEn,
p.type,
codename,
p.osmId.toString(),
p.areaKm2,
p.population,
p.centroid.lng,
p.centroid.lat,
);
if (existed) stats.updated++;
else stats.inserted++;
}
async function upsertDistrict(p: ParsedAdmin, stats: UpsertStats): Promise<void> {
// Find which province contains this district by ST_Within against existing
// synced province polygons. Falls back to province with largest overlap.
const provinceMatch = await prisma.$queryRawUnsafe<{ code: string }[]>(
`SELECT code FROM "vn_provinces"
WHERE geometry IS NOT NULL
AND ST_Contains(geometry, ST_SetSRID(ST_MakePoint($1, $2), 4326))
LIMIT 1`,
p.centroid.lng,
p.centroid.lat,
);
if (provinceMatch.length === 0) {
stats.skipped++;
return; // Cannot place district until provinces are synced first.
}
const provinceCode = provinceMatch[0]!.code;
const code = await resolveDistrictCode(p, provinceCode);
const codename = slugify(p.name);
const existed = await prisma.vnDistrict.findUnique({ where: { code }, select: { code: true } });
await prisma.$executeRawUnsafe(
`
INSERT INTO "vn_districts" (
code, "provinceCode", name, "nameEn", type, codename, "osmId",
"areaKm2", population, "lastSyncedAt", "updatedAt", geometry, centroid
) VALUES (
$1, $2, $3, $4, $5, $6, $7::bigint,
$8, $9, NOW(), NOW(),
${geomSql(p.geometry)},
ST_SetSRID(ST_MakePoint($10, $11), 4326)
)
ON CONFLICT (code) DO UPDATE SET
"provinceCode" = EXCLUDED."provinceCode",
name = EXCLUDED.name,
"nameEn" = EXCLUDED."nameEn",
type = EXCLUDED.type,
"osmId" = EXCLUDED."osmId",
"areaKm2" = EXCLUDED."areaKm2",
population = COALESCE(EXCLUDED.population, "vn_districts".population),
"lastSyncedAt" = NOW(),
"updatedAt" = NOW(),
geometry = EXCLUDED.geometry,
centroid = EXCLUDED.centroid
`,
code,
provinceCode,
p.name,
p.nameEn,
p.type,
codename,
p.osmId.toString(),
p.areaKm2,
p.population,
p.centroid.lng,
p.centroid.lat,
);
if (existed) stats.updated++;
else stats.inserted++;
}
async function upsertWard(p: ParsedAdmin, stats: UpsertStats): Promise<void> {
const districtMatch = await prisma.$queryRawUnsafe<{ code: string }[]>(
`SELECT code FROM "vn_districts"
WHERE geometry IS NOT NULL
AND ST_Contains(geometry, ST_SetSRID(ST_MakePoint($1, $2), 4326))
LIMIT 1`,
p.centroid.lng,
p.centroid.lat,
);
if (districtMatch.length === 0) {
stats.skipped++;
return;
}
const districtCode = districtMatch[0]!.code;
const code = await resolveWardCode(p, districtCode);
const codename = slugify(p.name);
const existed = await prisma.vnWard.findUnique({ where: { code }, select: { code: true } });
await prisma.$executeRawUnsafe(
`
INSERT INTO "vn_wards" (
code, "districtCode", name, "nameEn", type, codename, "osmId",
"areaKm2", population, "lastSyncedAt", "updatedAt", geometry, centroid
) VALUES (
$1, $2, $3, $4, $5, $6, $7::bigint,
$8, $9, NOW(), NOW(),
${geomSql(p.geometry)},
ST_SetSRID(ST_MakePoint($10, $11), 4326)
)
ON CONFLICT (code) DO UPDATE SET
"districtCode" = EXCLUDED."districtCode",
name = EXCLUDED.name,
"nameEn" = EXCLUDED."nameEn",
type = EXCLUDED.type,
"osmId" = EXCLUDED."osmId",
"areaKm2" = EXCLUDED."areaKm2",
population = COALESCE(EXCLUDED.population, "vn_wards".population),
"lastSyncedAt" = NOW(),
"updatedAt" = NOW(),
geometry = EXCLUDED.geometry,
centroid = EXCLUDED.centroid
`,
code,
districtCode,
p.name,
p.nameEn,
p.type,
codename,
p.osmId.toString(),
p.areaKm2,
p.population,
p.centroid.lng,
p.centroid.lat,
);
if (existed) stats.updated++;
else stats.inserted++;
}
// ─── Main ─────────────────────────────────────────────────────────────────
async function processChunk(
level: 4 | 6 | 8,
chunkName: string,
bbox: BBox,
): Promise<UpsertStats> {
const stats: UpsertStats = { inserted: 0, updated: 0, skipped: 0 };
const result = await fetchChunk(level, chunkName, bbox);
const fc = osmtogeojson(result, { flatProperties: false });
const features = (fc.features as Feature<Polygon | MultiPolygon>[]).filter(
(f) => f.geometry?.type === 'Polygon' || f.geometry?.type === 'MultiPolygon',
);
for (const feat of features) {
const parsed = parseFeature(feat, level);
if (!parsed) continue;
if (dryRun) {
stats.inserted++;
continue;
}
try {
if (level === 4) await upsertProvince(parsed, stats);
else if (level === 6) await upsertDistrict(parsed, stats);
else await upsertWard(parsed, stats);
} catch (err) {
console.error(`${parsed.name}: ${err instanceof Error ? err.message : err}`);
stats.skipped++;
}
}
console.log(
` ✓ level=${level} ${chunkName}: inserted=${stats.inserted} updated=${stats.updated} skipped=${stats.skipped}`,
);
return stats;
}
async function main(): Promise<void> {
console.log('🌏 OSM admin boundaries sync starting');
console.log(` levels: ${wantedLevels.join(',')}, chunks: ${chunkArg ?? 'all'}, dryRun=${dryRun}`);
const chunks = chunkArg
? { [chunkArg]: CHUNKS[chunkArg]! }
: CHUNKS;
const totals: Record<number, UpsertStats> = {
4: { inserted: 0, updated: 0, skipped: 0 },
6: { inserted: 0, updated: 0, skipped: 0 },
8: { inserted: 0, updated: 0, skipped: 0 },
};
// ALWAYS process levels in order 4 → 6 → 8, because 6 needs province
// polygons in the DB to assign provinceCode (and 8 needs districts).
for (const level of wantedLevels.sort() as (4 | 6 | 8)[]) {
console.log(`\n=== Level ${level} ===`);
for (const [name, bbox] of Object.entries(chunks)) {
try {
const s = await processChunk(level, name, bbox);
totals[level]!.inserted += s.inserted;
totals[level]!.updated += s.updated;
totals[level]!.skipped += s.skipped;
} catch (err) {
console.error(` ✗ chunk ${name} (level ${level}) failed:`, err);
}
}
}
console.log('\n📊 Totals');
for (const lvl of wantedLevels) {
const t = totals[lvl]!;
console.log(
` level=${lvl}: inserted=${t.inserted} updated=${t.updated} skipped=${t.skipped}`,
);
}
}
main()
.catch((err) => {
console.error(err);
process.exitCode = 1;
})
.finally(async () => {
await prisma.$disconnect();
await pool.end();
});