/**
 * Simplified Vietnam mainland polygon for "is this point in VN?" tests.
 *
 * The country has a very long, narrow shape (>1500 km north-south) with
 * neighbours pressing in on the western and northern borders. Bbox-based
 * filtering (the chunks the Overpass sync uses) inevitably catches some
 * `landuse=industrial` polygons that sit just across the border in Laos,
 * Cambodia, Thailand and southern China.
 *
 * The polygon below is a hand-traced ~30-vertex outline that follows the
 * official border closely enough for a point-in-polygon test. It's not
 * survey-grade — coastal islands are clipped, and the Mekong delta tip
 * is rounded — but it's sufficient to reject industrial sites that are
 * clearly not in VN. Where edge cases exist (a row landing in the
 * 1-2 km buffer near a border crossing), admin can promote / unlock by
 * hand from the OSM review queue.
 *
 * Format: GeoJSON Polygon, coordinates as `[lng, lat]` pairs (per the
 * spec). The ring is closed (first === last).
 *
 * NOTE(review): the single straight segment between the Châu Đốc and
 * Tây Ninh vertices appears to cut across Cambodia's Svay Rieng salient
 * (the "Parrot's Beak", Bavet area), so sites there would still pass the
 * test — confirm against a real border dataset and add vertices if needed.
 */

import type { Polygon } from 'geojson';

export const VN_COUNTRY_POLYGON: Polygon = {
  type: 'Polygon',
  coordinates: [
    [
      // Northern border, west → east. The northern edge is the actual
      // China border line; we trace it loosely.
      [102.14, 22.47], // Lai Châu / China junction
      [103.0, 22.78], // Lào Cai
      [104.0, 22.82], // northern Hà Giang
      [105.32, 23.39], // Đồng Văn (northernmost point)
      [106.55, 22.95], // Cao Bằng
      [107.0, 22.34], // Lạng Sơn
      [108.05, 21.55], // Móng Cái / Quảng Ninh
      // Eastern boundary at 110°E — generous on the sea side so that
      // every coastal industrial zone (Vũng Áng / Formosa, Dung Quất,
      // Nhơn Hội, Vũng Tàu, Long Sơn etc.) sits inside. This omits the
      // Hoàng Sa / Trường Sa archipelagos — fine, they have no KCN.
      [110.0, 21.0],
      [110.0, 18.0],
      [110.0, 15.0],
      [110.0, 12.0],
      [110.0, 9.5],
      // Mekong delta — Cà Mau cape, then Hà Tiên (south-west tip).
      [105.5, 8.4], // south of Cà Mau
      [104.83, 8.59], // Cà Mau
      [104.45, 10.39], // Hà Tiên
      // West / south-west, climbing along the Cambodia + Laos borders.
      [105.0, 10.78], // Châu Đốc
      [105.85, 11.38], // Tây Ninh
      [106.0, 11.7],
      [106.6, 11.95], // Lộc Ninh
      [107.55, 12.36], // Bù Đăng
      [107.55, 14.42], // Kon Tum
      [107.32, 16.0], // A Lưới
      [106.5, 16.45], // Hướng Hóa
      [105.97, 17.69], // Quảng Bình border
      [105.18, 18.66], // Hà Tĩnh / Laos border
      [104.34, 19.7], // Nghệ An / Laos
      [103.95, 20.66], // Mai Châu
      [103.05, 21.13], // Sơn La / Laos
      [102.78, 21.91], // Điện Biên
      [102.14, 22.47], // close ring
    ],
  ],
};

/** GeoJSON string ready to feed to PostGIS `ST_GeomFromGeoJSON`. */
export const VN_COUNTRY_POLYGON_GEOJSON = JSON.stringify(VN_COUNTRY_POLYGON);

/**
 * Even-odd ray-casting test of one closed ring of `[lng, lat]` vertices.
 *
 * A horizontal ray is cast from the point towards +lng; every edge it
 * crosses flips the inside/outside state. Vertices that lack a lng or
 * lat component are skipped instead of feeding `undefined` into the
 * arithmetic, which also keeps the function valid under
 * `noUncheckedIndexedAccess`.
 */
function isPointInRing(
  ring: readonly (readonly number[])[],
  lng: number,
  lat: number,
): boolean {
  let inside = false;
  // Start `prev` at the last vertex so the first iteration handles the
  // wrap-around edge (last → first).
  let prev = ring[ring.length - 1];
  for (const curr of ring) {
    const [xi, yi] = curr;
    const [xj, yj] = prev ?? [];
    prev = curr;
    if (xi === undefined || yi === undefined || xj === undefined || yj === undefined) {
      continue;
    }
    // Half-open comparison: an edge counts only when its endpoints lie on
    // opposite sides of the ray, so a shared vertex is never counted twice
    // and horizontal edges (yi === yj) never reach the division below.
    const straddlesRay = yi > lat !== yj > lat;
    if (straddlesRay && lng < ((xj - xi) * (lat - yi)) / (yj - yi) + xi) {
      inside = !inside;
    }
  }
  return inside;
}

/**
 * Pure-JS point-in-polygon test using the standard ray-casting algorithm.
 * Avoids pulling in `@turf/boolean-point-in-polygon` for the sync script
 * (one fewer dep, and we only have one polygon to test against).
 *
 * @param lng Longitude in degrees (GeoJSON order: longitude first).
 * @param lat Latitude in degrees.
 * @returns `true` when the point falls inside the outer ring of
 *   `VN_COUNTRY_POLYGON`; `false` otherwise, including for `NaN` or
 *   infinite coordinates. Points lying exactly on an edge may resolve to
 *   either side.
 */
export function isPointInVietnam(lng: number, lat: number): boolean {
  if (!Number.isFinite(lng) || !Number.isFinite(lat)) return false;
  const outerRing = VN_COUNTRY_POLYGON.coordinates[0];
  if (outerRing === undefined) return false;
  return isPointInRing(outerRing, lng, lat);
}
/**
 * Prune `IndustrialPark` rows whose centroid is outside the Vietnam
 * mainland polygon. Catches the cross-border bleed (Laos, Thailand,
 * Cambodia) that the Overpass bbox sync inevitably picks up.
 *
 * Usage:
 *   NODE_OPTIONS="-r dotenv/config" DOTENV_CONFIG_PATH=.env \
 *     pnpm tsx scripts/prune-non-vietnam-osm.ts [--dry-run]
 *
 * Strategy:
 *   1. Build a PostGIS polygon from `VN_COUNTRY_POLYGON_GEOJSON`.
 *   2. SELECT rows where `NOT ST_Within(location, polygon)`, scoped to
 *      OSM-sourced rows (we never want to delete a manually-curated
 *      row even if its centroid is wonky).
 *   3. DELETE in one statement (cascade removes any IndustrialListing
 *      rows attached to those parks).
 *
 * Safe to re-run: idempotent.
 */
import 'dotenv/config';
import { PrismaPg } from '@prisma/adapter-pg';
import { PrismaClient } from '@prisma/client';
import pg from 'pg';
import { VN_COUNTRY_POLYGON_GEOJSON } from './data/vn-country-polygon';

const pool = new pg.Pool({ connectionString: process.env['DATABASE_URL'] });
const adapter = new PrismaPg(pool);
const prisma = new PrismaClient({ adapter });

// `--dry-run` prints the preview and exits before the DELETE.
const dryRun = process.argv.includes('--dry-run');

/** Shape of one preview row returned by the SELECT in `main`. */
interface OutsideRow {
  id: string;
  name: string;
  province: string;
  lat: number;
  lng: number;
  ha: number;
}

/**
 * Lists OSM-sourced `IndustrialPark` rows located outside the Vietnam
 * polygon, prints the 15 largest by area, and — unless `--dry-run` was
 * passed — deletes every matching row.
 */
async function main(): Promise<void> {
  // One predicate shared by the preview SELECT and the DELETE so the two
  // statements cannot drift apart. The GeoJSON is bound as parameter $1
  // (cast to text to pick the text overload of ST_GeomFromGeoJSON) rather
  // than spliced into the SQL string with hand-escaped quotes.
  const outsideVnPredicate = `"dataSource" IN ('OSM', 'OSM_PROMOTED')
        AND NOT ST_Within(
          location::geometry,
          ST_SetSRID(ST_GeomFromGeoJSON($1::text), 4326)
        )`;

  const outsideRows = await prisma.$queryRawUnsafe<OutsideRow[]>(
    `SELECT id, name, province,
            ROUND(ST_Y(location::geometry)::numeric, 3)::float AS lat,
            ROUND(ST_X(location::geometry)::numeric, 3)::float AS lng,
            COALESCE("totalAreaHa", 0) AS ha
       FROM "IndustrialPark"
      WHERE ${outsideVnPredicate}
      ORDER BY ha DESC NULLS LAST`,
    VN_COUNTRY_POLYGON_GEOJSON,
  );

  console.log(`📍 Found ${outsideRows.length} OSM rows OUTSIDE the VN polygon.`);

  if (outsideRows.length === 0) {
    console.log('✓ Catalog is clean.');
    return;
  }

  // Show the top 15 by area so the operator can sanity-check before deleting.
  console.log(' Top 15 by area (will be deleted):');
  for (const row of outsideRows.slice(0, 15)) {
    console.log(
      ` ${row.name.slice(0, 50).padEnd(50)} ${row.province.slice(0, 16).padEnd(16)} ${
        row.ha
      } ha (${row.lat}, ${row.lng})`,
    );
  }

  if (dryRun) {
    console.log('💡 --dry-run: no writes performed.');
    return;
  }

  console.log(`\n🗑 Deleting ${outsideRows.length} rows…`);
  const result = await prisma.$executeRawUnsafe(
    `DELETE FROM "IndustrialPark"
      WHERE ${outsideVnPredicate}`,
    VN_COUNTRY_POLYGON_GEOJSON,
  );
  console.log(`✓ Removed ${result} rows.`);

  // The DELETE re-evaluates the predicate, so a concurrent write between
  // the preview and the delete can change the affected set; surface that.
  if (result !== outsideRows.length) {
    console.warn(
      `⚠ Deleted row count (${result}) differs from the preview (${outsideRows.length}).`,
    );
  }
}

main()
  .catch((err: unknown) => {
    console.error(err);
    // Set the exit code instead of calling process.exit so the cleanup in
    // `finally` still runs.
    process.exitCode = 1;
  })
  .finally(async () => {
    await prisma.$disconnect();
    await pool.end();
  });
'Chưa xác định'; @@ -206,6 +210,10 @@ function parseFeature( totalAreaHa = Math.round((area(feat as Feature) / 10000) * 100) / 100; } + // Geographic gate: drop sites whose centroid falls outside the Vietnam + // mainland polygon (Laos / Thailand / Cambodia / southern China bleed). + if (!isPointInVietnam(cLng, cLat)) return null; + // Province resolution: prefer explicit OSM tags, then fall back to a // nearest-centroid lookup against our 63-province table. The fallback // catches the (very common) case where Vietnamese landuse polygons have