520 lines
18 KiB
TypeScript
520 lines
18 KiB
TypeScript
/**
 * Sync Vietnam administrative boundaries from OpenStreetMap into the
 * `vn_provinces` / `vn_districts` / `vn_wards` tables.
 *
 * Usage:
 *   NODE_OPTIONS="-r dotenv/config" DOTENV_CONFIG_PATH=.env \
 *     pnpm tsx scripts/sync-osm-admin-boundaries.ts \
 *     [--level=4|6|8|all] [--dry-run] [--chunk=NAME]
 *
 *   `--level` also accepts a comma-separated list (e.g. `--level=4,6`).
 *
 * What it does:
 *   1. Queries Overpass for `boundary=administrative + admin_level=N`
 *      relations clipped to the Vietnam bbox (split into 4 chunks).
 *   2. Converts each relation's outer rings into a MultiPolygon GeoJSON.
 *   3. Looks up the GSO code from OSM tags (`ref:VN`, `gso_code`, `ref`,
 *      `iso_code`, fallback to slugified name → existing seed row).
 *   4. Upserts the row, writing geometry + centroid + areaKm2 + osmId.
 *
 * Coverage targets:
 *   admin_level=4 → 63 provinces (5 cities of central authority + 58 tỉnh)
 *   admin_level=6 → ~700 districts (quận / huyện / thị xã / TP thuộc tỉnh)
 *   admin_level=8 → ~11,000 wards (phường / xã / thị trấn)
 *
 * Notes:
 *   • Vietnam reformed wards in 2025 (some merged). We track historic
 *     names via `vn_administrative_aliases` — this script populates that
 *     table when an OSM tag `was:name` differs from the current name.
 *     NOTE(review): no code in this file currently writes to that table —
 *     confirm whether alias population is still planned or lives elsewhere.
 *   • Wards (level 8) are the heaviest pull (~11k polygons). We always
 *     chunk them into 4 geographic slices to dodge Overpass timeouts.
 */
|
|
import 'dotenv/config';
|
|
import { PrismaPg } from '@prisma/adapter-pg';
|
|
import { PrismaClient } from '@prisma/client';
|
|
import area from '@turf/area';
|
|
import centroid from '@turf/centroid';
|
|
import type { Feature, MultiPolygon, Polygon } from 'geojson';
|
|
import osmtogeojson from 'osmtogeojson';
|
|
import pg from 'pg';
|
|
import { isPointInVietnam } from './data/vn-country-polygon';
|
|
|
|
// Postgres connection string; when unset, `pg` receives `undefined` here.
const connectionString = process.env['DATABASE_URL'];

// One shared pg pool backs the Prisma client through the pg driver adapter.
const pool = new pg.Pool({ connectionString });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// Public Overpass instance used unless OVERPASS_URL points somewhere else.
const DEFAULT_OVERPASS_URL = 'https://overpass-api.de/api/interpreter';
const OVERPASS_URL = process.env['OVERPASS_URL'] ?? DEFAULT_OVERPASS_URL;
|
|
|
|
/**
 * Latitude/longitude bounds of one query window. The four values are
 * interpolated into the Overpass bbox filter as (south, west, north, east).
 */
interface BBox {
  /** Lower latitude bound. */
  south: number;
  /** Lower longitude bound. */
  west: number;
  /** Upper latitude bound. */
  north: number;
  /** Upper longitude bound. */
  east: number;
}

/** Builds a slice that spans the full 102°–110° longitude range used by every chunk. */
const latitudeBand = (south: number, north: number): BBox => ({
  south,
  west: 102.0,
  north,
  east: 110.0,
});

/**
 * Four north-to-south latitude bands covering the country, keyed by chunk
 * name (the value accepted by `--chunk=`). Splitting keeps each Overpass
 * query within a reasonable budget.
 * NOTE(review): said to mirror the chunking of the KCN sync — confirm.
 */
const CHUNKS: Record<string, BBox> = {
  north: latitudeBand(19.0, 23.5),
  northCentral: latitudeBand(16.5, 19.0),
  southCentral: latitudeBand(13.0, 16.5),
  south: latitudeBand(8.0, 13.0),
};
|
|
|
|
// ─── CLI ──────────────────────────────────────────────────────────────────
|
|
// Arguments after the node binary and the script path.
const argv = process.argv.slice(2);

/** Returns the text after `--<flag>=` for the first matching argument, if any. */
const optionValue = (flag: string): string | undefined => {
  const prefix = `--${flag}=`;
  const match = argv.find((arg) => arg.startsWith(prefix));
  return match === undefined ? undefined : match.slice(prefix.length);
};

/** admin_level values this script knows how to sync. */
const SUPPORTED_LEVELS = [4, 6, 8];

// --dry-run: fetch and parse only, never write to the database.
const dryRun = argv.includes('--dry-run');
// --chunk=NAME: restrict the run to a single entry of CHUNKS.
const chunkArg = optionValue('chunk');
// --level=…: "all" (default) or a comma-separated list of levels.
const levelArg = optionValue('level') ?? 'all';

// Unsupported or non-numeric entries are silently dropped from the list.
const wantedLevels: number[] =
  levelArg === 'all'
    ? [...SUPPORTED_LEVELS]
    : levelArg
        .split(',')
        .map((piece) => Number(piece.trim()))
        .filter((candidate) => SUPPORTED_LEVELS.includes(candidate));
|
|
|
|
// ─── Slug helper (matches GSO codename style) ──────────────────────────────
|
|
/**
 * Turn a Vietnamese place name into a lowercase ASCII slug joined by
 * underscores, e.g. "Thành phố Hồ Chí Minh" → "thanh_pho_ho_chi_minh".
 *
 * @param name Display name; may be in composed or decomposed Unicode form.
 * @returns The slug, or an empty string when no ASCII letter/digit remains.
 */
function slugify(name: string): string {
  return (
    name
      .toLowerCase()
      // "đ" has no canonical decomposition, so NFD would leave it untouched.
      .replace(/đ/g, 'd')
      // Split accented letters into base letter + combining marks…
      .normalize('NFD')
      // …then drop the marks. Written as escapes (U+0300–U+036F) because
      // literal combining characters are invisible and easily mangled by
      // editors, copy/paste and Unicode normalisation of the source file.
      .replace(/[\u0300-\u036f]/g, '')
      // Collapse every run of other characters into a single underscore.
      .replace(/[^a-z0-9]+/g, '_')
      // Trim underscores left at either end.
      .replace(/^_+|_+$/g, '')
  );
}
|
|
|
|
// ─── Overpass fetch ────────────────────────────────────────────────────────
|
|
/** Subset of the Overpass JSON response that this script relies on. */
interface OverpassResult {
  /** Raw OSM elements; handed to osmtogeojson unchanged. */
  elements: unknown[];
  /**
   * Free-text note from the server. Overpass uses it to report runtime
   * failures (timeout, out of memory) while still answering with HTTP 200.
   */
  remark?: string;
}

/**
 * Fetch all administrative boundary relations of one admin level inside one
 * bounding box from Overpass.
 *
 * @param level OSM `admin_level` value to query.
 * @param name  Chunk name, used for log output only.
 * @param bbox  Query window.
 * @returns The decoded Overpass response.
 * @throws Error when the HTTP status is not OK, when the server reports a
 *         runtime error in `remark`, or when the payload has no `elements`
 *         array.
 */
async function fetchChunk(level: number, name: string, bbox: BBox): Promise<OverpassResult> {
  // `out geom` returns the relation members with inline geometry so we can
  // assemble polygons without a second roundtrip. Timeout 300s for level=8.
  const query = `
    [out:json][timeout:300];
    relation
      ["boundary"="administrative"]
      ["admin_level"="${level}"]
      (${bbox.south},${bbox.west},${bbox.north},${bbox.east});
    out body geom;
  `;
  console.log(` → fetching level=${level} chunk="${name}"…`);
  const start = Date.now();
  const res = await fetch(OVERPASS_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/x-www-form-urlencoded',
      'User-Agent': 'goodgo-osm-admin-sync/1.0 (https://goodgo.vn)',
    },
    body: 'data=' + encodeURIComponent(query),
  });
  if (!res.ok) {
    const body = await res.text();
    throw new Error(`Overpass returned ${res.status}: ${body.slice(0, 200)}`);
  }

  const json = (await res.json()) as OverpassResult;

  // A server-side timeout or memory abort arrives as HTTP 200 with a remark
  // and a truncated (often empty) element list. Treat it as a failed chunk
  // rather than as "nothing to sync".
  if (typeof json.remark === 'string' && /error/i.test(json.remark)) {
    throw new Error(`Overpass reported a runtime error: ${json.remark.slice(0, 200)}`);
  }
  if (!Array.isArray(json.elements)) {
    throw new Error('Overpass response does not contain an "elements" array');
  }

  const seconds = ((Date.now() - start) / 1000).toFixed(1);
  console.log(` ← level=${level} ${name}: ${json.elements.length} relations in ${seconds}s`);
  return json;
}
|
|
|
|
// ─── Per-feature parser ────────────────────────────────────────────────────
|
|
/** One administrative unit extracted from an OSM relation, ready to upsert. */
interface ParsedAdmin {
  /** OSM admin_level the relation was queried at. */
  level: 4 | 6 | 8;
  /** Numeric OSM relation id. */
  osmId: bigint;
  /** Display name taken from `name:vi`, falling back to `name`. */
  name: string;
  /** Value of the `name:en` tag, when present. */
  nameEn: string | null;
  /** Official code read from OSM tags; null when no candidate tag is set. */
  gsoCode: string | null;
  /** Unit type derived from the name prefix: "Tỉnh" / "Quận" / "Phường" etc. */
  type: string;
  /** Boundary as a MultiPolygon; a single Polygon is wrapped into one. */
  geometry: MultiPolygon;
  /** Centroid of the feature as computed by turf. */
  centroid: { lng: number; lat: number };
  /** Area in square kilometres, rounded to two decimals. */
  areaKm2: number;
  /** `population` tag as a number when it consists of digits only. */
  population: number | null;
  /** Tag map the fields above were read from. */
  rawTags: Record<string, string>;
}
|
|
|
|
/** The five centrally governed cities that sit at province level. */
const CENTRAL_CITY_NAMES = ['Hà Nội', 'Hồ Chí Minh', 'Hải Phòng', 'Đà Nẵng', 'Cần Thơ'];

/**
 * Matches a central city name at the start of the string, with or without a
 * leading "Thành phố" / "TP" / "TP." prefix. The pattern text is normalised
 * to NFC so it always compares against NFC-normalised input.
 */
const CENTRAL_CITY_PATTERN = new RegExp(
  `^(?:(?:Thành phố|TP\\.?)\\s*)?(?:${CENTRAL_CITY_NAMES.join('|')})`.normalize('NFC'),
  'i',
);

/**
 * Classify a level-4 unit as a centrally governed city or a regular province.
 *
 * The city prefix is optional: OSM often names these relations without it
 * (e.g. just "Hà Nội"), and requiring it would label such rows as "Tỉnh".
 * Input is trimmed and normalised to NFC so decomposed names match as well.
 *
 * @param name Display name of the level-4 relation.
 * @returns 'Thành phố Trung ương' for the five central cities, else 'Tỉnh'.
 */
const PROVINCE_TYPE_MAP = (name: string): string =>
  CENTRAL_CITY_PATTERN.test(name.normalize('NFC').trim()) ? 'Thành phố Trung ương' : 'Tỉnh';
|
|
|
|
/**
 * Derive the district type from the prefix of its name.
 *
 * @param name Display name of the level-6 relation.
 * @returns The matching type label; names with no known prefix count as 'Quận'.
 */
const DISTRICT_TYPE_MAP = (name: string): string => {
  // Ordered (prefix pattern, label) pairs; the first pattern that matches wins.
  const prefixRules: ReadonlyArray<readonly [RegExp, string]> = [
    [/^Quận/i, 'Quận'],
    [/^Huyện/i, 'Huyện'],
    [/^Thị xã/i, 'Thị xã'],
    [/^Thành phố/i, 'Thành phố thuộc tỉnh'],
  ];
  const matched = prefixRules.find(([pattern]) => pattern.test(name));
  return matched ? matched[1] : 'Quận';
};
|
|
|
|
/**
 * Derive the ward type from the prefix of its name.
 *
 * @param name Display name of the level-8 relation.
 * @returns 'Phường' or 'Thị trấn' when the name starts with that prefix;
 *          'Xã' otherwise, which covers both an explicit "Xã" prefix and
 *          names with no recognised prefix.
 */
const WARD_TYPE_MAP = (name: string): string => {
  if (/^Phường/i.test(name)) {
    return 'Phường';
  }
  return /^Thị trấn/i.test(name) ? 'Thị trấn' : 'Xã';
};
|
|
|
|
/**
 * Convert one GeoJSON feature produced by osmtogeojson into a ParsedAdmin.
 *
 * @param feat  Polygon or MultiPolygon feature for an OSM element.
 * @param level admin_level the feature was queried at; selects the type mapper.
 * @returns The parsed record, or null when the feature has no properties, is
 *          not a relation, has no usable name, or its centroid is rejected
 *          by `isPointInVietnam`.
 */
function parseFeature(
  feat: Feature<Polygon | MultiPolygon>,
  level: 4 | 6 | 8,
): ParsedAdmin | null {
  const properties = feat.properties as Record<string, unknown> | null;
  if (!properties) return null;

  // The prefixed id ("relation/123") is expected on `feat.id`; `properties.id`
  // is consulted only when `feat.id` is absent. Anything that is not a
  // relation is dropped.
  const relationPrefix = 'relation/';
  const rawId = (feat as unknown as { id?: unknown }).id ?? properties['id'] ?? '';
  const prefixedId = String(rawId);
  if (!prefixedId.startsWith(relationPrefix)) return null;
  const osmId = BigInt(prefixedId.slice(relationPrefix.length));

  // Tags live under `properties.tags` when that is an object; otherwise the
  // properties bag itself is treated as the tag map.
  const nestedTags = properties['tags'];
  const tags = (
    nestedTags && typeof nestedTags === 'object' ? nestedTags : properties
  ) as Record<string, string>;

  const name = tags['name:vi'] ?? tags['name'] ?? null;
  if (!name) return null;
  // Skip rows without any Latin/Vietnamese letter (cross-border bleed).
  const hasLatinLetter = /[A-Za-zÀ-ỹ]/.test(name);
  if (!hasLatinLetter) return null;

  const nameEn = tags['name:en'] ?? null;
  // First candidate tag that is set wins.
  const gsoCode =
    tags['ref:VN'] ?? tags['gso_code'] ?? tags['ref'] ?? tags['iso_code'] ?? null;

  // Only plain digit strings are accepted as a population figure.
  let population: number | null = null;
  const populationTag = tags['population'];
  if (populationTag && /^\d+$/.test(populationTag)) {
    population = Number(populationTag);
  }

  // Store every boundary as a MultiPolygon; a lone Polygon is wrapped once.
  let geometry: MultiPolygon;
  if (feat.geometry.type === 'Polygon') {
    geometry = { type: 'MultiPolygon', coordinates: [feat.geometry.coordinates] };
  } else {
    geometry = feat.geometry;
  }

  const centre = centroid(feat as Feature);
  const [cLng, cLat] = centre.geometry.coordinates;
  // Geographic gate: drop relations whose centroid sits outside the VN
  // mainland polygon (China / Laos / Cambodia bleed across the bbox).
  if (!isPointInVietnam(cLng, cLat)) return null;

  // turf reports square metres; convert to km² and keep two decimals.
  const squareMetres = area(feat as Feature);
  const areaKm2 = Math.round((squareMetres / 1_000_000) * 100) / 100;

  const type =
    level === 4
      ? PROVINCE_TYPE_MAP(name)
      : level === 6
        ? DISTRICT_TYPE_MAP(name)
        : WARD_TYPE_MAP(name);

  return {
    level,
    osmId,
    name,
    nameEn,
    gsoCode,
    type,
    geometry,
    centroid: { lng: cLng, lat: cLat },
    areaKm2,
    population,
    rawTags: tags,
  };
}
|
|
|
|
// ─── Resolve to existing GSO code or generate a synthetic one ─────────────
|
|
/**
 * Pick the primary-key code to store a province under.
 *
 * Resolution order: the OSM-supplied code if a row with that code already
 * exists, then an existing row whose codename equals the slugified name,
 * and finally a synthetic `OSM_<relation id>` code.
 *
 * @param p Parsed province.
 * @returns The code to upsert under.
 */
async function resolveProvinceCode(p: ParsedAdmin): Promise<string> {
  const { gsoCode } = p;
  if (gsoCode) {
    const seeded = await prisma.vnProvince.findUnique({ where: { code: gsoCode } });
    if (seeded) return gsoCode;
  }

  // No usable code from OSM: try to match a row by its codename slug.
  const slugMatch = await prisma.vnProvince.findFirst({
    where: { codename: slugify(p.name) },
  });
  // Unknown everywhere: derive a code from the relation id so reruns are stable.
  return slugMatch ? slugMatch.code : `OSM_${p.osmId.toString()}`;
}
|
|
|
|
/**
 * Pick the primary-key code to store a district under.
 *
 * Resolution order: the OSM-supplied code if a row with that code already
 * exists, then an existing row in the same province whose codename equals
 * the slugified name, and finally a synthetic `OSM_<relation id>` code.
 *
 * @param p            Parsed district.
 * @param provinceCode Code of the province the district was placed in.
 * @returns The code to upsert under.
 */
async function resolveDistrictCode(p: ParsedAdmin, provinceCode: string): Promise<string> {
  const { gsoCode } = p;
  if (gsoCode) {
    const seeded = await prisma.vnDistrict.findUnique({ where: { code: gsoCode } });
    if (seeded) return gsoCode;
  }

  // The slug lookup is scoped to the parent province.
  const slugMatch = await prisma.vnDistrict.findFirst({
    where: { codename: slugify(p.name), provinceCode },
  });
  return slugMatch ? slugMatch.code : `OSM_${p.osmId.toString()}`;
}
|
|
|
|
/**
 * Pick the primary-key code to store a ward under.
 *
 * Resolution order: the OSM-supplied code if a row with that code already
 * exists, then an existing row in the same district whose codename equals
 * the slugified name, and finally a synthetic `OSM_<relation id>` code.
 *
 * @param p            Parsed ward.
 * @param districtCode Code of the district the ward was placed in.
 * @returns The code to upsert under.
 */
async function resolveWardCode(p: ParsedAdmin, districtCode: string): Promise<string> {
  const { gsoCode } = p;
  if (gsoCode) {
    const seeded = await prisma.vnWard.findUnique({ where: { code: gsoCode } });
    if (seeded) return gsoCode;
  }

  // The slug lookup is scoped to the parent district.
  const slugMatch = await prisma.vnWard.findFirst({
    where: { codename: slugify(p.name), districtCode },
  });
  return slugMatch ? slugMatch.code : `OSM_${p.osmId.toString()}`;
}
|
|
|
|
// ─── Upsert helpers — raw SQL because Prisma can't manage geometry ────────
|
|
/**
 * Build the SQL expression that turns a GeoJSON MultiPolygon into a PostGIS
 * geometry. The result is spliced into the statement text (not bound as a
 * parameter), so single quotes in the serialised JSON are doubled.
 *
 * The SRID is stamped explicitly as 4326. ST_GeomFromGeoJSON only defaults
 * to 4326 from PostGIS 3.0 on, while the centroid column and the
 * ST_Contains lookups in this file always use SRID 4326; leaving the
 * geometry SRID implicit risks mixed-SRID errors on older servers.
 *
 * @param g Geometry to serialise.
 * @returns A SQL fragment evaluating to a MultiPolygon with SRID 4326.
 */
function geomSql(g: MultiPolygon): string {
  const json = JSON.stringify(g).replace(/'/g, "''");
  return `ST_Multi(ST_SetSRID(ST_GeomFromGeoJSON('${json}'), 4326))`;
}
|
|
|
|
/** Running counters for one chunk or one admin level. */
interface UpsertStats {
  /** Rows written whose code was not in the table yet (dry runs count every parsed feature here). */
  inserted: number;
  /** Rows written whose code already existed. */
  updated: number;
  /** Features that could not be placed in a parent unit or whose upsert failed. */
  skipped: number;
}
|
|
|
|
/**
 * Insert or refresh one province row from a parsed OSM relation.
 *
 * On conflict every synced column is overwritten except `codename`, which
 * is written on insert only, and `population`, which keeps its stored value
 * when OSM supplies none.
 *
 * @param p     Parsed level-4 relation.
 * @param stats Counters to bump: `updated` if the code already existed,
 *              `inserted` otherwise.
 */
async function upsertProvince(p: ParsedAdmin, stats: UpsertStats): Promise<void> {
  const code = await resolveProvinceCode(p);
  const codename = slugify(p.name);
  // Probe before writing so the summary can tell inserts from updates.
  const priorRow = await prisma.vnProvince.findUnique({
    where: { code },
    select: { code: true },
  });

  // Raw SQL because the geometry columns are written through PostGIS
  // functions. The polygon is spliced in by geomSql(); all else is bound.
  const statement = `
    INSERT INTO "vn_provinces" (
      code, name, "nameEn", type, codename, "osmId",
      "areaKm2", population, "lastSyncedAt", "updatedAt", geometry, centroid
    ) VALUES (
      $1, $2, $3, $4, $5, $6::bigint,
      $7, $8, NOW(), NOW(),
      ${geomSql(p.geometry)},
      ST_SetSRID(ST_MakePoint($9, $10), 4326)
    )
    ON CONFLICT (code) DO UPDATE SET
      name = EXCLUDED.name,
      "nameEn" = EXCLUDED."nameEn",
      type = EXCLUDED.type,
      "osmId" = EXCLUDED."osmId",
      "areaKm2" = EXCLUDED."areaKm2",
      population = COALESCE(EXCLUDED.population, "vn_provinces".population),
      "lastSyncedAt" = NOW(),
      "updatedAt" = NOW(),
      geometry = EXCLUDED.geometry,
      centroid = EXCLUDED.centroid
  `;
  // Order must match the $1…$10 placeholders above.
  const bindings = [
    code,
    p.name,
    p.nameEn,
    p.type,
    codename,
    p.osmId.toString(),
    p.areaKm2,
    p.population,
    p.centroid.lng,
    p.centroid.lat,
  ];
  await prisma.$executeRawUnsafe(statement, ...bindings);

  if (priorRow) {
    stats.updated += 1;
  } else {
    stats.inserted += 1;
  }
}
|
|
|
|
/**
 * Insert or refresh one district row from a parsed OSM relation.
 *
 * The parent province is the first stored province polygon that contains
 * the district's centroid (ST_Contains, LIMIT 1). There is no fallback:
 * when no province matches, the district is counted as skipped, which is
 * what happens if provinces have not been synced yet.
 *
 * @param p     Parsed level-6 relation.
 * @param stats Counters to bump: `skipped`, `updated` or `inserted`.
 */
async function upsertDistrict(p: ParsedAdmin, stats: UpsertStats): Promise<void> {
  const containingProvinces = await prisma.$queryRawUnsafe<{ code: string }[]>(
    `SELECT code FROM "vn_provinces"
     WHERE geometry IS NOT NULL
       AND ST_Contains(geometry, ST_SetSRID(ST_MakePoint($1, $2), 4326))
     LIMIT 1`,
    p.centroid.lng,
    p.centroid.lat,
  );
  const parent = containingProvinces[0];
  if (containingProvinces.length === 0) {
    // Cannot place district until provinces are synced first.
    stats.skipped += 1;
    return;
  }
  const provinceCode = parent!.code;

  const code = await resolveDistrictCode(p, provinceCode);
  const codename = slugify(p.name);
  // Probe before writing so the summary can tell inserts from updates.
  const priorRow = await prisma.vnDistrict.findUnique({
    where: { code },
    select: { code: true },
  });

  // The polygon is spliced in by geomSql(); every other value is bound.
  const statement = `
    INSERT INTO "vn_districts" (
      code, "provinceCode", name, "nameEn", type, codename, "osmId",
      "areaKm2", population, "lastSyncedAt", "updatedAt", geometry, centroid
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7::bigint,
      $8, $9, NOW(), NOW(),
      ${geomSql(p.geometry)},
      ST_SetSRID(ST_MakePoint($10, $11), 4326)
    )
    ON CONFLICT (code) DO UPDATE SET
      "provinceCode" = EXCLUDED."provinceCode",
      name = EXCLUDED.name,
      "nameEn" = EXCLUDED."nameEn",
      type = EXCLUDED.type,
      "osmId" = EXCLUDED."osmId",
      "areaKm2" = EXCLUDED."areaKm2",
      population = COALESCE(EXCLUDED.population, "vn_districts".population),
      "lastSyncedAt" = NOW(),
      "updatedAt" = NOW(),
      geometry = EXCLUDED.geometry,
      centroid = EXCLUDED.centroid
  `;
  // Order must match the $1…$11 placeholders above.
  const bindings = [
    code,
    provinceCode,
    p.name,
    p.nameEn,
    p.type,
    codename,
    p.osmId.toString(),
    p.areaKm2,
    p.population,
    p.centroid.lng,
    p.centroid.lat,
  ];
  await prisma.$executeRawUnsafe(statement, ...bindings);

  if (priorRow) {
    stats.updated += 1;
  } else {
    stats.inserted += 1;
  }
}
|
|
|
|
/**
 * Insert or refresh one ward row from a parsed OSM relation.
 *
 * The parent district is the first stored district polygon that contains
 * the ward's centroid (ST_Contains, LIMIT 1). When no district matches,
 * the ward is counted as skipped and nothing is written.
 *
 * @param p     Parsed level-8 relation.
 * @param stats Counters to bump: `skipped`, `updated` or `inserted`.
 */
async function upsertWard(p: ParsedAdmin, stats: UpsertStats): Promise<void> {
  const containingDistricts = await prisma.$queryRawUnsafe<{ code: string }[]>(
    `SELECT code FROM "vn_districts"
     WHERE geometry IS NOT NULL
       AND ST_Contains(geometry, ST_SetSRID(ST_MakePoint($1, $2), 4326))
     LIMIT 1`,
    p.centroid.lng,
    p.centroid.lat,
  );
  const parent = containingDistricts[0];
  if (containingDistricts.length === 0) {
    stats.skipped += 1;
    return;
  }
  const districtCode = parent!.code;

  const code = await resolveWardCode(p, districtCode);
  const codename = slugify(p.name);
  // Probe before writing so the summary can tell inserts from updates.
  const priorRow = await prisma.vnWard.findUnique({
    where: { code },
    select: { code: true },
  });

  // The polygon is spliced in by geomSql(); every other value is bound.
  const statement = `
    INSERT INTO "vn_wards" (
      code, "districtCode", name, "nameEn", type, codename, "osmId",
      "areaKm2", population, "lastSyncedAt", "updatedAt", geometry, centroid
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7::bigint,
      $8, $9, NOW(), NOW(),
      ${geomSql(p.geometry)},
      ST_SetSRID(ST_MakePoint($10, $11), 4326)
    )
    ON CONFLICT (code) DO UPDATE SET
      "districtCode" = EXCLUDED."districtCode",
      name = EXCLUDED.name,
      "nameEn" = EXCLUDED."nameEn",
      type = EXCLUDED.type,
      "osmId" = EXCLUDED."osmId",
      "areaKm2" = EXCLUDED."areaKm2",
      population = COALESCE(EXCLUDED.population, "vn_wards".population),
      "lastSyncedAt" = NOW(),
      "updatedAt" = NOW(),
      geometry = EXCLUDED.geometry,
      centroid = EXCLUDED.centroid
  `;
  // Order must match the $1…$11 placeholders above.
  const bindings = [
    code,
    districtCode,
    p.name,
    p.nameEn,
    p.type,
    codename,
    p.osmId.toString(),
    p.areaKm2,
    p.population,
    p.centroid.lng,
    p.centroid.lat,
  ];
  await prisma.$executeRawUnsafe(statement, ...bindings);

  if (priorRow) {
    stats.updated += 1;
  } else {
    stats.inserted += 1;
  }
}
|
|
|
|
// ─── Main ─────────────────────────────────────────────────────────────────
|
|
/**
 * Fetch one (level, chunk) slice from Overpass, convert it to GeoJSON and
 * upsert every usable polygon feature.
 *
 * Failures are isolated per feature: an exception while parsing or writing
 * one feature is logged and counted as skipped, and the loop carries on.
 * In dry-run mode nothing is written and every parsed feature is counted
 * as `inserted`.
 *
 * @param level     admin_level being synced; selects the upsert routine.
 * @param chunkName Chunk name, used for log output.
 * @param bbox      Query window for this chunk.
 * @returns Counters for this chunk.
 * @throws Whatever `fetchChunk` or `osmtogeojson` throw for the whole slice.
 */
async function processChunk(
  level: 4 | 6 | 8,
  chunkName: string,
  bbox: BBox,
): Promise<UpsertStats> {
  const stats: UpsertStats = { inserted: 0, updated: 0, skipped: 0 };
  const result = await fetchChunk(level, chunkName, bbox);
  // Keep tags nested under `properties.tags`, which is where parseFeature
  // looks for them first.
  const fc = osmtogeojson(result, { flatProperties: false });
  const features = (fc.features as Feature<Polygon | MultiPolygon>[]).filter(
    (f) => f.geometry?.type === 'Polygon' || f.geometry?.type === 'MultiPolygon',
  );

  for (const feat of features) {
    // Until the name is known, identify the feature by its OSM id in logs.
    let label = String(feat.id ?? 'unknown feature');
    try {
      // Parsing sits inside the try so that one malformed feature cannot
      // abort the remaining features of the chunk.
      const parsed = parseFeature(feat, level);
      if (!parsed) continue;
      label = parsed.name;

      if (dryRun) {
        stats.inserted++;
        continue;
      }

      if (level === 4) await upsertProvince(parsed, stats);
      else if (level === 6) await upsertDistrict(parsed, stats);
      else await upsertWard(parsed, stats);
    } catch (err) {
      console.error(` ✗ ${label}: ${err instanceof Error ? err.message : String(err)}`);
      stats.skipped++;
    }
  }

  console.log(
    ` ✓ level=${level} ${chunkName}: inserted=${stats.inserted} updated=${stats.updated} skipped=${stats.skipped}`,
  );
  return stats;
}
|
|
|
|
/**
 * Entry point: run the requested levels over the requested chunks and print
 * per-level totals.
 *
 * A chunk that fails is logged and the run continues with the next one, but
 * the process exit code is set to 1 so callers can detect a partial sync.
 *
 * @throws Error when `--level` yields no supported level or `--chunk` names
 *         a chunk that does not exist.
 */
async function main(): Promise<void> {
  console.log('🌏 OSM admin boundaries sync starting');
  console.log(` levels: ${wantedLevels.join(',')}, chunks: ${chunkArg ?? 'all'}, dryRun=${dryRun}`);

  if (wantedLevels.length === 0) {
    throw new Error('No supported level in --level; expected 4, 6, 8 (comma-separated) or "all"');
  }

  // An empty or missing --chunk means "all chunks". A named chunk must exist,
  // otherwise an undefined bbox would reach the Overpass query builder.
  let chunks: Record<string, BBox>;
  if (!chunkArg) {
    chunks = CHUNKS;
  } else {
    const selected = Object.prototype.hasOwnProperty.call(CHUNKS, chunkArg)
      ? CHUNKS[chunkArg]
      : undefined;
    if (!selected) {
      throw new Error(
        `Unknown --chunk="${chunkArg}"; expected one of: ${Object.keys(CHUNKS).join(', ')}`,
      );
    }
    chunks = { [chunkArg]: selected };
  }

  const totals: Record<number, UpsertStats> = {
    4: { inserted: 0, updated: 0, skipped: 0 },
    6: { inserted: 0, updated: 0, skipped: 0 },
    8: { inserted: 0, updated: 0, skipped: 0 },
  };

  // ALWAYS process levels in order 4 → 6 → 8, because 6 needs province
  // polygons in the DB to assign provinceCode (and 8 needs districts).
  // Sort a de-duplicated copy numerically: the default sort compares as
  // strings and would also reorder the shared module-level array in place.
  const orderedLevels = [...new Set(wantedLevels)].sort((a, b) => a - b) as (4 | 6 | 8)[];

  for (const level of orderedLevels) {
    console.log(`\n=== Level ${level} ===`);
    for (const [name, bbox] of Object.entries(chunks)) {
      try {
        const s = await processChunk(level, name, bbox);
        const levelTotals = totals[level]!;
        levelTotals.inserted += s.inserted;
        levelTotals.updated += s.updated;
        levelTotals.skipped += s.skipped;
      } catch (err) {
        console.error(` ✗ chunk ${name} (level ${level}) failed:`, err);
        // Keep going, but do not report success for an incomplete sync.
        process.exitCode = 1;
      }
    }
  }

  console.log('\n📊 Totals');
  for (const lvl of orderedLevels) {
    const t = totals[lvl]!;
    console.log(
      ` level=${lvl}: inserted=${t.inserted} updated=${t.updated} skipped=${t.skipped}`,
    );
  }
}
|
|
|
|
// Run the sync. Any error is printed and turned into a non-zero exit code;
// the Prisma client and the pg pool are closed whether the run succeeded or not.
void (async (): Promise<void> => {
  try {
    await main();
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
    await pool.end();
  }
})();
|