Files
goodgo-platform/scripts/sync-osm-industrial-parks.ts
Ho Ngoc Hai fba536406d feat(osm): foundation — admin boundaries, POI catalog, sync orchestrator
This is the Phase 0 + Phase 1 + Phase 4 foundation of the full OSM
integration plan. It backfills three things the rest of the platform
has been faking with hardcoded tables, and gives admins one dashboard
for every OSM-sourced layer.

Phase 0 — Vietnam administrative boundaries
* New columns on vn_provinces / vn_districts / vn_wards: PostGIS
  geometry (MultiPolygon), centroid (Point), areaKm2, osmId, population,
  lastSyncedAt + GIST indexes on geometry/centroid.
* `scripts/sync-osm-admin-boundaries.ts` pulls
  `boundary=administrative + admin_level=4|6|8` from Overpass per chunk,
  filters to mainland VN via the existing country polygon, resolves the
  GSO code (or generates `OSM_<id>`), and upserts via raw SQL because
  Prisma can't manage PostGIS columns.
* `GeoLookupService` (shared module) replaces the old
  `nearestProvince()` heuristic — `lookup(lng,lat)` returns
  province/district/ward via `ST_Contains` on the GIST-indexed polygons.
* The KCN sync now resolves province/district from the polygon table
  and falls back to the centroid heuristic only when polygons aren't
  loaded yet.
* `scripts/backfill-admin-codes.ts` rewrites province/district/ward on
  IndustrialPark, ProjectDevelopment and Property using the new lookup.

Phase 1 — POI catalog (15 categories, schema only here)
* New `Poi` table with `PoiCategory` enum, OSM provenance columns,
  GIST index on `location`. New `TransportLine` for metro/highway
  multilinestrings.
* `scripts/sync-osm-poi.ts` queries Overpass per category × chunk,
  resolves province/district codes from the boundary polygons, and
  upserts while honouring `osmLocked` / `lockedFields` the same way the
  KCN sync does.
* New NestJS `PoiModule` exposes:
    GET /poi/by-bbox    — GeoJSON for map overlays
    GET /poi/nearby     — sidebar "tiện ích xung quanh" (HMAC distance ranks)
    GET /poi/coverage   — admin per-category counts
* New web component `<NearbyPoiSidebar />` ready to drop into listing /
  project / KCN detail pages.

Phase 4 — Sync orchestrator + admin dashboard
* New `OsmSyncRun` audit table tracks every sync invocation
  (RUNNING / SUCCESS / PARTIAL / FAILED + row stats + error message).
* `OsmSyncService` spawns the right tsx script for any (layer, category,
  chunk) tuple, parses stats out of stdout, updates the run row.
* `OsmSyncCronService` schedules:
    Daily 02:00  → POI category rotation (1/day, 20-day cycle)
    Mon  02:30  → admin-boundaries provinces
    Wed  02:30  → admin-boundaries districts
    Sat  02:30  → admin-boundaries wards
    1st of month 03:00 → industrial-parks (per chunk)
  All gated by `OSM_SYNC_ENABLED=true`.
* New admin endpoints under `/admin/osm/*` (layers / coverage / runs /
  trigger), guarded by JWT + ADMIN role.
* New `/admin/osm` Next.js page: stat cards, coverage table with
  per-row "Sync now", recent runs list with auto-refresh every 15s.

Run on dev so far: 33 provinces + 1100+ districts (still finishing) +
305 hospitals POI imported.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 12:01:19 +07:00

488 lines
18 KiB
TypeScript

/**
* OSM → goodgo Industrial Park bulk sync (PR 2/4 of OSM-sync project).
*
* Usage:
* NODE_OPTIONS="-r dotenv/config" DOTENV_CONFIG_PATH=.env \
* pnpm tsx scripts/sync-osm-industrial-parks.ts [--dry-run] [--chunk=north]
*
* Flags:
* --dry-run Fetch from Overpass + show counts, don't write to DB.
* --chunk=NAME Process only one chunk (north|northCentral|southCentral|south).
* Default: process all four.
*
* Strategy:
* • Vietnam is split into 4 horizontal bbox slices (Overpass times out
* on the whole country at once).
* • For each chunk we ask Overpass for `landuse=industrial` ways +
* relations + nodes within the bbox, with inline geometry.
* • osmtogeojson normalises the OSM JSON into a GeoJSON FeatureCollection.
* • For each Feature we upsert a row keyed on `osmId`. The schema
* guarantees uniqueness via the constraint added in PR 1.
* • PostGIS columns (`location` Point + `boundary` MultiPolygon) are
* written via raw SQL because Prisma cannot manage Geometry types.
* • Nodes get `location` only — no boundary.
* • Conflict resolution: respect `osmLocked` and `lockedFields` from PR 1.
*/
import 'dotenv/config';
import { createId } from '@paralleldrive/cuid2';
import { PrismaPg } from '@prisma/adapter-pg';
import { type Prisma, PrismaClient } from '@prisma/client';
import area from '@turf/area';
import centroid from '@turf/centroid';
import type { Feature, MultiPolygon, Polygon, Point } from 'geojson';
import osmtogeojson from 'osmtogeojson';
import pg from 'pg';
import { isPointInVietnam } from './data/vn-country-polygon';
import { nearestProvince } from './data/vn-province-centroids';
/**
 * Produces a fresh cuid2 identifier, wrapped in an already-resolved promise
 * so call sites can simply `await` it.
 */
function generateCuid(): Promise<string> {
  return Promise.resolve(createId());
}
// ─── Setup ────────────────────────────────────────────────────────────────
// Single pg connection pool for the whole run, configured from DATABASE_URL.
const pool = new pg.Pool({ connectionString: process.env['DATABASE_URL'] });
// Driver adapter that routes Prisma's queries through the pool above.
const adapter = new PrismaPg(pool);
// Prisma client used for the typed and raw SQL calls in this script.
const prisma = new PrismaClient({ adapter });
// Overpass endpoint; OVERPASS_URL overrides the public overpass-api.de default.
const OVERPASS_URL =
process.env['OVERPASS_URL'] ?? 'https://overpass-api.de/api/interpreter';
/** Latitude/longitude rectangle used to scope a single Overpass query. */
interface BBox {
  /** Southern (lowest-latitude) edge. */
  south: number;
  /** Western (lowest-longitude) edge. */
  west: number;
  /** Northern (highest-latitude) edge. */
  north: number;
  /** Eastern (highest-longitude) edge. */
  east: number;
}

/** Builds one latitude band; every band shares the same 102–110 longitude span. */
const latitudeBand = (south: number, north: number): BBox => ({
  south,
  west: 102.0,
  north,
  east: 110.0,
});

/**
 * Vietnam cut into four stacked latitude bands, because one country-wide
 * Overpass query times out. The key order here is the processing order when
 * no `--chunk` flag is supplied.
 */
const CHUNKS: Record<string, BBox> = {
  north: latitudeBand(19.0, 23.5),
  northCentral: latitudeBand(16.5, 19.0),
  southCentral: latitudeBand(13.0, 16.5),
  south: latitudeBand(8.0, 13.0),
};
// ─── CLI flags ────────────────────────────────────────────────────────────
/** Command-line arguments after the runtime binary and the script path. */
const argv = process.argv.slice(2);
/** True when `--dry-run` was passed: fetch and count only, no DB writes. */
const dryRun = argv.some((arg) => arg === '--dry-run');
const CHUNK_FLAG_PREFIX = '--chunk=';
const chunkFlag = argv.find((arg) => arg.startsWith(CHUNK_FLAG_PREFIX));
/** Value of `--chunk=NAME`, or undefined when the flag is absent. */
const chunkArg = chunkFlag?.slice(CHUNK_FLAG_PREFIX.length);
// ─── Province / region heuristic from centroid ────────────────────────────
/**
 * Coarse latitude-band heuristic for the region of a centroid: 19 and above
 * is NORTH, 13 up to (but excluding) 19 is CENTRAL, anything else is SOUTH.
 * It is only an approximation; an admin can correct the stored value later.
 */
function guessRegion(lat: number): 'NORTH' | 'CENTRAL' | 'SOUTH' {
  return lat >= 19 ? 'NORTH' : lat >= 13 ? 'CENTRAL' : 'SOUTH';
}
// ─── Helpers ──────────────────────────────────────────────────────────────
/**
 * Builds a URL slug of the form `<ascii-name>-osm-<osmId>`.
 *
 * The name is lower-cased, `đ` is mapped to `d`, diacritics are stripped
 * (NFD decomposition, then removal of combining marks U+0300–U+036F), and
 * every run of characters outside `a-z0-9` becomes a single hyphen. The
 * result is capped at 60 characters. When nothing survives (for example a
 * name with no Latin letters or digits) the literal `kcn` is used.
 *
 * @param name Display name of the park.
 * @param osmId OSM id, appended as a suffix after `-osm-`.
 * @returns The slug.
 */
function slugify(name: string, osmId: string): string {
  const base = name
    .toLowerCase()
    .replace(/đ/g, 'd')
    // Decompose so each accent becomes a separate combining mark, then drop
    // the marks. The range is written as escapes so it survives editors and
    // tools that normalise or hide literal combining characters.
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 60)
    // Truncation can land right after a separator; trim again so the slug
    // never contains `--osm-`.
    .replace(/-+$/, '');
  return `${base || 'kcn'}-osm-${osmId}`;
}
/** The parts of an Overpass JSON response that this script reads. */
interface OverpassResult {
  /** Raw OSM elements; the caller hands the response to osmtogeojson. */
  elements: unknown[];
  /**
   * Free-text note that Overpass adds to an HTTP 200 response when the query
   * ended abnormally, e.g. "runtime error: Query timed out …".
   */
  remark?: string;
}

/**
 * Downloads every `landuse=industrial` way, relation and node inside `bbox`
 * from the Overpass API, with inline geometry (`out body geom`).
 *
 * @param name Chunk label, used only in log and error messages.
 * @param bbox Bounding box to query.
 * @returns The parsed Overpass JSON response.
 * @throws Error when Overpass answers with a non-2xx status, or answers 200
 *   but reports a runtime error (timeout / out of memory) in `remark`.
 */
async function fetchChunk(name: string, bbox: BBox): Promise<OverpassResult> {
  // Overpass bbox filters are ordered (south, west, north, east).
  const bounds = `${bbox.south},${bbox.west},${bbox.north},${bbox.east}`;
  const query = `
    [out:json][timeout:180];
    (
      way["landuse"="industrial"](${bounds});
      relation["landuse"="industrial"](${bounds});
      node["landuse"="industrial"](${bounds});
    );
    out body geom;
  `;
  console.log(` → fetching chunk "${name}" from Overpass…`);
  const start = Date.now();
  // Overpass expects the query in a `data=` body field with form-encoding.
  const res = await fetch(OVERPASS_URL, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/x-www-form-urlencoded',
      'User-Agent': 'goodgo-osm-sync/1.0 (https://goodgo.vn)',
    },
    body: 'data=' + encodeURIComponent(query),
  });
  if (!res.ok) {
    const body = await res.text();
    throw new Error(`Overpass returned ${res.status}: ${body.slice(0, 200)}`);
  }
  const json = (await res.json()) as OverpassResult;

  // A 200 response can still carry a server-side failure: the element list is
  // then empty or truncated. Fail like any other Overpass error instead of
  // reporting a clean (but incomplete) chunk.
  if (typeof json.remark === 'string' && json.remark.length > 0) {
    if (/runtime error/i.test(json.remark)) {
      throw new Error(
        `Overpass reported an error for chunk "${name}": ${json.remark.slice(0, 200)}`,
      );
    }
    console.warn(` ⚠️ Overpass remark for chunk "${name}": ${json.remark.slice(0, 200)}`);
  }

  const elapsedSeconds = ((Date.now() - start) / 1000).toFixed(1);
  console.log(`${name}: ${json.elements?.length ?? 0} elements in ${elapsedSeconds}s`);
  return json;
}
// ─── Per-feature upsert ──────────────────────────────────────────────────
/** One OSM feature, normalised into the values the upsert step writes. */
interface ParsedFeature {
  // ── OSM provenance ──
  /** Element type taken from the `type/id` prefix, upper-cased. */
  osmType: 'NODE' | 'WAY' | 'RELATION';
  /** Numeric part of the `type/id` identifier. */
  osmId: bigint;
  /** OSM object version, or null when the feature carries no numeric one. */
  osmVersion: number | null;
  /** Tag map of the feature, stored alongside the row. */
  tags: Record<string, string>;

  // ── Naming ──
  /** `name:vi` when present, otherwise `name`. */
  name: string;
  /** `name:en` tag, or null. */
  nameEn: string | null;
  /** `operator` tag, or null. */
  operator: string | null;
  /** Operator, else `operator:wikidata`, else a "not determined" placeholder. */
  developer: string;

  // ── Geometry ──
  /** Point coordinates for nodes; computed centroid for polygons. */
  centroid: { lng: number; lat: number };
  /** GeoJSON Polygon|MultiPolygon as an unparsed JSON string for ST_GeomFromGeoJSON. */
  boundaryGeoJson: string | null;
  /** Polygon area in hectares, rounded to two decimals; 0 for nodes. */
  totalAreaHa: number;

  // ── Address ──
  /** Province name from OSM tags or the nearest-centroid heuristic. */
  province: string;
  /** `addr:district` / `addr:suburb` tag, or an empty string. */
  district: string;
  /** `addr:full`, else house number + street, possibly empty. */
  address: string;
}
/** OSM tag keys whose Vietnamese-province values we recognise. */
const VN_PROVINCE_HINTS = ['addr:province', 'addr:state', 'addr:region', 'is_in:province'];

/**
 * Converts one GeoJSON feature produced by osmtogeojson into a
 * `ParsedFeature`, or returns null when the feature should not be imported.
 *
 * A feature is rejected (null) when:
 *  - it has no properties, or no usable `node|way|relation/<digits>` id;
 *  - its geometry is missing or is not a Point, Polygon or MultiPolygon;
 *  - it has no `name:vi` / `name` tag;
 *  - its name contains no character matched by `[A-Za-zÀ-ỹ]`;
 *  - its centroid is not finite or lies outside the Vietnam polygon.
 *
 * NOTE(review): the Overpass query in this file uses `out body`, which
 * presumably omits version metadata, so `osmVersion` is likely always null.
 * Confirm whether `out meta` was intended.
 *
 * @param feat Feature from the osmtogeojson FeatureCollection.
 * @returns The normalised feature, or null when it is skipped.
 */
function parseFeature(
  feat: Feature<Polygon | MultiPolygon | Point>,
): ParsedFeature | null {
  const propsRaw = feat.properties as Record<string, unknown> | null;
  if (!propsRaw) return null;

  // The declared geometry type comes from a cast at the call site, so check
  // it at runtime. Anything else (e.g. a LineString from an unclosed way, or
  // a null geometry) cannot be stored as a MultiPolygon boundary.
  const geometryType: string | undefined = feat.geometry?.type;
  if (
    geometryType !== 'Point' &&
    geometryType !== 'Polygon' &&
    geometryType !== 'MultiPolygon'
  ) {
    return null;
  }

  // The id is expected as "way/123", "relation/456" or "node/789" in
  // `properties.id`. When that value has no type prefix, fall back to the
  // feature-level `id` (presumably where osmtogeojson's structured, non-flat
  // output keeps the prefixed form — TODO confirm).
  const propId = propsRaw['id'];
  const idStr =
    typeof propId === 'string' && propId.includes('/')
      ? propId
      : String(feat.id ?? propId ?? '');
  const slashIdx = idStr.indexOf('/');
  if (slashIdx < 0) return null;
  const typeRaw = idStr.slice(0, slashIdx).toUpperCase();
  if (typeRaw !== 'NODE' && typeRaw !== 'WAY' && typeRaw !== 'RELATION') {
    return null;
  }
  const osmType = typeRaw as 'NODE' | 'WAY' | 'RELATION';
  // BigInt() throws on non-numeric input; validate first so one odd feature
  // is skipped instead of aborting the whole run.
  const idDigits = idStr.slice(slashIdx + 1);
  if (!/^\d+$/.test(idDigits)) return null;
  const osmId = BigInt(idDigits);

  // Tags live under `properties.tags` when that is an object; otherwise the
  // properties object itself is treated as the (flattened) tag map.
  const tagsRaw = propsRaw['tags'];
  const tags: Record<string, string> =
    tagsRaw && typeof tagsRaw === 'object'
      ? (tagsRaw as Record<string, string>)
      : (propsRaw as Record<string, string>);

  const name = tags['name:vi'] ?? tags['name'] ?? null;
  // Skip purely unnamed industrial polygons — too noisy for our catalog.
  if (!name) return null;
  // Skip names with no character in A-Z, a-z or U+00C0–U+1EF9. This catches
  // polygons that bleed across the northern border (Quảng Ninh / Lạng Sơn
  // bbox edges) and carry only CJK names. The range is broad, so it only
  // filters names made up entirely of characters outside it.
  if (!/[A-Za-zÀ-ỹ]/.test(name)) return null;

  const operator = tags['operator'] ?? null;
  const developer = operator ?? tags['operator:wikidata'] ?? 'Chưa xác định';

  // Derive centroid + (optional) boundary GeoJSON.
  let cLng: number;
  let cLat: number;
  let boundaryGeoJson: string | null = null;
  let totalAreaHa = 0;
  if (feat.geometry.type === 'Point') {
    // Nodes get a location only — no boundary, zero area.
    [cLng, cLat] = feat.geometry.coordinates;
  } else {
    // Polygon or MultiPolygon
    const c = centroid(feat as Feature);
    [cLng, cLat] = c.geometry.coordinates;
    // Normalise to MultiPolygon for storage in the boundary column.
    const geom: MultiPolygon =
      feat.geometry.type === 'Polygon'
        ? { type: 'MultiPolygon', coordinates: [feat.geometry.coordinates] }
        : feat.geometry;
    boundaryGeoJson = JSON.stringify(geom);
    // turf/area returns m² — convert to ha (1 ha = 10 000 m²), 2 decimals.
    totalAreaHa = Math.round((area(feat as Feature) / 10000) * 100) / 100;
  }
  // Degenerate geometry (e.g. empty coordinate arrays) can yield NaN.
  if (!Number.isFinite(cLng) || !Number.isFinite(cLat)) return null;

  // Geographic gate: drop sites whose centroid falls outside the Vietnam
  // mainland polygon (Laos / Thailand / Cambodia / southern China bleed).
  if (!isPointInVietnam(cLng, cLat)) return null;

  // Province resolution: prefer explicit OSM tags, then fall back to a
  // nearest-centroid lookup. This is only a bootstrap value; the DB upsert
  // step replaces it with a polygon lookup when admin polygons are loaded.
  const province =
    VN_PROVINCE_HINTS.map((k) => tags[k]).find(Boolean) ??
    tags['addr:city'] ??
    nearestProvince(cLat, cLng);
  const district = tags['addr:district'] ?? tags['addr:suburb'] ?? '';
  // `join` always returns a string (possibly empty), so no further default
  // is needed after it.
  const streetAddress = [tags['addr:housenumber'], tags['addr:street']]
    .filter(Boolean)
    .join(' ');
  const address = tags['addr:full'] ?? streetAddress;

  // Version is read from the flat property first, then from a nested `meta`
  // object (presumably the structured osmtogeojson layout — TODO confirm).
  const meta = propsRaw['meta'];
  const versionRaw =
    propsRaw['version'] ??
    (meta && typeof meta === 'object'
      ? (meta as Record<string, unknown>)['version']
      : undefined);
  const osmVersion = typeof versionRaw === 'number' ? versionRaw : null;

  return {
    osmId,
    osmType,
    osmVersion,
    name,
    nameEn: tags['name:en'] ?? null,
    developer,
    operator,
    tags,
    centroid: { lng: cLng, lat: cLat },
    boundaryGeoJson,
    totalAreaHa,
    province,
    district,
    address,
  };
}
/** Running totals for one sync invocation; mutated in place. */
interface UpsertStats {
  /** Rows newly created. */
  inserted: number;
  /** Existing rows refreshed. */
  updated: number;
  /** Features whose upsert threw; incremented by the caller. */
  skipped: number;
  /** Existing rows left untouched because `osmLocked` is set. */
  locked: number;
}

/** Province / district / region values finally written for a feature. */
interface ResolvedAdmin {
  province: string;
  district: string;
  region: 'NORTH' | 'CENTRAL' | 'SOUTH';
}

/**
 * Finds the province polygon containing the point and, within that province,
 * the district polygon containing it. Either name is null when no polygon
 * with a non-null geometry matches.
 */
async function lookupAdminNames(
  lng: number,
  lat: number,
): Promise<{ provinceName: string | null; districtName: string | null }> {
  const rows = await prisma.$queryRawUnsafe<
    { provinceName: string | null; districtName: string | null }[]
  >(
    `WITH p AS (
       SELECT code, name FROM "vn_provinces"
       WHERE geometry IS NOT NULL
         AND ST_Contains(geometry, ST_SetSRID(ST_MakePoint($1, $2), 4326))
       LIMIT 1
     )
     SELECT
       (SELECT name FROM p) AS "provinceName",
       (SELECT d.name FROM "vn_districts" d JOIN p ON p.code = d."provinceCode"
         WHERE d.geometry IS NOT NULL
           AND ST_Contains(d.geometry, ST_SetSRID(ST_MakePoint($1, $2), 4326))
         LIMIT 1) AS "districtName"`,
    lng,
    lat,
  );
  return rows[0] ?? { provinceName: null, districtName: null };
}

/**
 * Inserts a new IndustrialPark row via raw SQL, because Prisma's `create()`
 * cannot populate the PostGIS `location` / `boundary` columns.
 */
async function insertPark(parsed: ParsedFeature, admin: ResolvedAdmin): Promise<void> {
  const cuid = await generateCuid();
  const slug = slugify(parsed.name, parsed.osmId.toString());
  const tagsJson = JSON.stringify(parsed.tags);
  // Every value, including the boundary GeoJSON, is a bound parameter.
  // A null boundary (nodes) flows through the ST_* calls as NULL; the
  // explicit ::text cast selects the text overload of ST_GeomFromGeoJSON.
  await prisma.$executeRawUnsafe(
    `
    INSERT INTO "IndustrialPark" (
      id, name, "nameEn", slug, developer, operator, status,
      location, address, district, province, region,
      "totalAreaHa", "leasableAreaHa", "occupancyRate", "remainingAreaHa",
      "tenantCount", "targetIndustries",
      "osmId", "osmType", "osmVersion", "osmTags",
      "dataSource", "isPublic", "lastSyncedAt", "createdAt", "updatedAt",
      boundary
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7::"IndustrialParkStatus",
      ST_SetSRID(ST_MakePoint($8, $9), 4326), $10, $11, $12, $13::"VietnamRegion",
      $14, $15, $16, $17,
      $18, $19::text[],
      $20::bigint, $21::"IndustrialParkOsmType", $22, $23::jsonb,
      $24::"IndustrialParkDataSource", $25, $26, NOW(), NOW(),
      ST_Multi(ST_GeomFromGeoJSON($27::text))
    )
    `,
    cuid, // $1  id
    parsed.name, // $2
    parsed.nameEn, // $3
    slug, // $4
    parsed.developer, // $5
    parsed.operator, // $6
    'OPERATIONAL', // $7  status
    parsed.centroid.lng, // $8
    parsed.centroid.lat, // $9
    parsed.address, // $10
    admin.district, // $11
    admin.province, // $12
    admin.region, // $13
    parsed.totalAreaHa, // $14 totalAreaHa
    parsed.totalAreaHa, // $15 leasableAreaHa
    0, // $16 occupancyRate
    parsed.totalAreaHa, // $17 remainingAreaHa
    0, // $18 tenantCount
    [], // $19 targetIndustries
    parsed.osmId.toString(), // $20
    parsed.osmType, // $21
    parsed.osmVersion, // $22
    tagsJson, // $23
    'OSM', // $24 dataSource
    false, // $25 isPublic
    new Date(), // $26 lastSyncedAt
    parsed.boundaryGeoJson, // $27 boundary GeoJSON, null for nodes
  );
}

/**
 * Refreshes an existing row: scalar columns through Prisma (skipping any
 * column named in `locked`) and geometry columns through raw SQL, both in a
 * single transaction so the row is never left half-updated.
 *
 * NOTE(review): the geometry columns are refreshed regardless of
 * `lockedFields`, as before — confirm whether `location` / `boundary`
 * should be lockable too.
 */
async function updatePark(
  parsed: ParsedFeature,
  admin: ResolvedAdmin,
  locked: ReadonlySet<string>,
): Promise<void> {
  const data: Prisma.IndustrialParkUpdateInput = {
    osmTags: parsed.tags as Prisma.InputJsonValue,
    osmVersion: parsed.osmVersion,
    lastSyncedAt: new Date(),
  };
  if (!locked.has('name')) data.name = parsed.name;
  if (!locked.has('nameEn')) data.nameEn = parsed.nameEn;
  if (!locked.has('operator')) data.operator = parsed.operator;
  if (!locked.has('developer')) data.developer = parsed.developer;
  if (!locked.has('address')) data.address = parsed.address;
  if (!locked.has('district')) data.district = admin.district;
  if (!locked.has('province')) data.province = admin.province;
  if (!locked.has('region')) data.region = admin.region;
  // A zero area means "no polygon" (node feature); keep the stored area then.
  if (!locked.has('totalAreaHa') && parsed.totalAreaHa > 0) {
    data.totalAreaHa = parsed.totalAreaHa;
  }

  // Without a polygon only the point is refreshed; an existing boundary is
  // left as it is.
  const refreshGeometry =
    parsed.boundaryGeoJson != null
      ? prisma.$executeRaw`
          UPDATE "IndustrialPark"
          SET
            location = ST_SetSRID(ST_MakePoint(${parsed.centroid.lng}, ${parsed.centroid.lat}), 4326),
            boundary = ST_Multi(ST_GeomFromGeoJSON(${parsed.boundaryGeoJson}))
          WHERE "osmId" = ${parsed.osmId}
        `
      : prisma.$executeRaw`
          UPDATE "IndustrialPark"
          SET location = ST_SetSRID(ST_MakePoint(${parsed.centroid.lng}, ${parsed.centroid.lat}), 4326)
          WHERE "osmId" = ${parsed.osmId}
        `;

  await prisma.$transaction([
    prisma.industrialPark.update({ where: { osmId: parsed.osmId }, data }),
    refreshGeometry,
  ]);
}

/**
 * Inserts or refreshes the IndustrialPark row keyed on `parsed.osmId`.
 *
 *  - A row with `osmLocked = true` is never touched (`stats.locked`).
 *  - Province comes from the admin-polygon lookup when it matches, else from
 *    `parsed.province`. District keeps the OSM-tag value when present and
 *    uses the polygon lookup only when the tag value is empty.
 *  - `stats.inserted` / `stats.updated` are bumped only after every write
 *    for the feature has succeeded, so a failing feature is counted once
 *    (as an error, by the caller).
 *
 * `parsed` is not modified.
 *
 * NOTE(review): the row is looked up by `osmId` alone. OSM ids are unique
 * per element type only, so a node and a way sharing a numeric id would
 * resolve to the same row — confirm the unique constraint is intended.
 *
 * @param parsed Normalised feature to write.
 * @param stats Counters updated in place.
 * @throws Whatever the database layer throws; the caller handles it.
 */
async function upsertFeature(
  parsed: ParsedFeature,
  stats: UpsertStats,
): Promise<void> {
  // Cheap pre-flight: if a row already exists with osmLocked=true, skip.
  const existing = await prisma.industrialPark.findUnique({
    where: { osmId: parsed.osmId },
    select: { id: true, osmLocked: true, lockedFields: true, dataSource: true },
  });
  if (existing?.osmLocked) {
    stats.locked += 1;
    return;
  }

  const match = await lookupAdminNames(parsed.centroid.lng, parsed.centroid.lat);
  const admin: ResolvedAdmin = {
    province: match.provinceName ?? parsed.province,
    district: parsed.district || (match.districtName ?? ''),
    region: guessRegion(parsed.centroid.lat),
  };

  if (!existing) {
    await insertPark(parsed, admin);
    stats.inserted += 1;
    return;
  }

  await updatePark(parsed, admin, new Set(existing.lockedFields));
  stats.updated += 1;
}
// ─── Main ─────────────────────────────────────────────────────────────────
/**
 * Entry point: for each requested chunk, fetch from Overpass, convert to
 * GeoJSON, parse every feature and (unless `--dry-run`) upsert it.
 *
 * An unknown chunk name is warned about and skipped. A failing upsert, or a
 * feature whose parsing throws, is logged and counted; neither aborts the
 * run. A failing Overpass fetch does propagate and ends the run.
 *
 * The stdout lines are kept stable on purpose; warnings go to stderr.
 */
async function main(): Promise<void> {
  const chunkNames = chunkArg ? [chunkArg] : Object.keys(CHUNKS);
  const totalStats: UpsertStats = { inserted: 0, updated: 0, skipped: 0, locked: 0 };
  let parseSkipped = 0;
  console.log(
    `🌍 OSM industrial-park sync — chunks: ${chunkNames.join(', ')} ` +
      `(dry-run: ${dryRun})\n`,
  );
  for (const chunkName of chunkNames) {
    // Own-property check: a plain index lookup would also resolve inherited
    // keys such as "constructor" and slip past the unknown-chunk guard.
    const bbox = Object.prototype.hasOwnProperty.call(CHUNKS, chunkName)
      ? CHUNKS[chunkName]
      : undefined;
    if (!bbox) {
      console.warn(` ⚠️ unknown chunk "${chunkName}" — skipping`);
      continue;
    }
    const overpass = await fetchChunk(chunkName, bbox);
    // osmtogeojson is loosely typed; it accepts the raw Overpass JSON.
    const fc = osmtogeojson(overpass) as unknown as {
      features: Feature<Polygon | MultiPolygon | Point>[];
    };
    const parsed: ParsedFeature[] = [];
    for (const f of fc.features) {
      let p: ParsedFeature | null = null;
      try {
        p = parseFeature(f);
      } catch (err) {
        // One malformed feature must not abort the whole chunk.
        console.warn(
          ` ⚠️ parse failed for feature ${String(f.id ?? '?')}: ` +
            `${err instanceof Error ? err.message : String(err)}`,
        );
      }
      if (p) parsed.push(p);
      else parseSkipped += 1;
    }
    console.log(
      `${chunkName}: ${parsed.length} named industrial parks ` +
        `(skipped ${fc.features.length - parsed.length} unnamed/malformed)`,
    );
    if (dryRun) {
      console.log(
        ` ▸ first 3 names: ${parsed
          .slice(0, 3)
          .map((p) => p.name)
          .join(', ')}`,
      );
      continue;
    }
    // Upserts run one at a time; a failure is counted and the loop goes on.
    for (const p of parsed) {
      try {
        await upsertFeature(p, totalStats);
      } catch (err) {
        totalStats.skipped += 1;
        console.warn(
          ` ⚠️ upsert failed for ${p.osmType.toLowerCase()}/${p.osmId}: ` +
            `${err instanceof Error ? err.message : String(err)}`,
        );
      }
    }
  }
  console.log('\n✓ Sync complete.');
  console.log(
    ` inserted: ${totalStats.inserted} ` +
      `· updated: ${totalStats.updated} ` +
      `· locked-skipped: ${totalStats.locked} ` +
      `· errors: ${totalStats.skipped} ` +
      `· parse-skipped: ${parseSkipped}`,
  );
}
// Run the sync, then always release the database connections.
main()
  .catch((err: unknown) => {
    console.error('❌ Sync failed:', err);
    // Record the failure without terminating here: calling process.exit()
    // at this point would end the process before the cleanup below runs.
    process.exitCode = 1;
  })
  .finally(async () => {
    try {
      await prisma.$disconnect();
    } finally {
      // Close the pool even if the Prisma disconnect throws.
      await pool.end();
    }
  });