diff --git a/apps/status/src/data.ts b/apps/status/src/data.ts index d816f9b..312f09c 100644 --- a/apps/status/src/data.ts +++ b/apps/status/src/data.ts @@ -58,56 +58,6 @@ export interface MonitorRow { latency_history: Array<{ region: string; latency_ms: number | null; ts: string }>; } -// Average latency of the *fastest* region per monitor over a given window. -// Status pages are customer-facing — we want to show our best foot forward, -// not a noisy average that gets dragged down by a single distant region. -export async function loadFastestRegionLatency( - monitorIds: string[], - bucket: BucketType, - intervalLiteral: string, -): Promise> { - const out: Record = {}; - if (monitorIds.length === 0) return out; - for (const id of monitorIds) out[id] = null; - - const ids = sql.array(monitorIds); - let rows = await sql` - SELECT monitor_id, region, - (sum(avg_latency * total) / NULLIF(sum(total), 0))::float AS avg_lat - FROM monitor_uptime_rollup - WHERE monitor_id = ANY(${ids}::text[]) - AND bucket_type = ${bucket} - AND bucket_start > now() - ${intervalLiteral}::interval - AND avg_latency IS NOT NULL - GROUP BY 1, 2 - `; - - if (rows.length === 0) { - // Fallback while rollup is unpopulated. Bounded by the same window so cheap. - rows = await sql` - SELECT monitor_id, COALESCE(region, 'default') AS region, - avg(latency_ms)::float AS avg_lat - FROM pings - WHERE monitor_id = ANY(${ids}::text[]) - AND checked_at > now() - ${intervalLiteral}::interval - AND latency_ms IS NOT NULL - GROUP BY 1, 2 - `; - } - - // For each monitor, keep the region with the lowest average latency. - for (const r of rows) { - if (r.avg_lat == null) continue; - const cur = out[r.monitor_id]; - if (cur == null || r.avg_lat < cur) out[r.monitor_id] = r.avg_lat; - } - // Round to integer ms. - for (const id of Object.keys(out)) { - if (out[id] != null) out[id] = Math.round(out[id] as number); - } - return out; -} - // Single SQL pass that produces all four uptime windows for a set of monitors. // Reads only the rollup table; falls back to a pings aggregate when the rollup // has nothing for these monitors yet (same pattern as loadMonitors). @@ -238,18 +188,20 @@ export async function loadMonitors(pageId: string, window: Window, pageDisplayMo }); } - // Step 3: uptime rollup buckets covering the requested window. + // Step 3: uptime rollup buckets covering the requested window. We keep + // region in the result so JS can pick the fastest region per monitor and + // emit per-bucket latency from just that region (status pages are + // customer-facing, we show our best foot forward). const { bucket, count } = WINDOW_TO_BUCKET[window]; const truncUnit = bucket === "hourly" ? "hour" : "day"; const intervalLiteral = `${count} ${truncUnit}s`; let rollupRows = await sql` - SELECT monitor_id, bucket_start, sum(total)::int AS total, sum(up_count)::int AS up_count, avg(avg_latency)::real AS avg_latency + SELECT monitor_id, region, bucket_start, total, up_count, avg_latency FROM monitor_uptime_rollup WHERE monitor_id = ANY(${sql.array(ids)}::text[]) AND bucket_type = ${bucket} AND bucket_start > date_trunc(${truncUnit}, now()) - ${intervalLiteral}::interval - GROUP BY monitor_id, bucket_start - ORDER BY monitor_id, bucket_start ASC + ORDER BY monitor_id, region, bucket_start ASC `; // Fallback: if the rollup table has nothing for any of these monitors in @@ -257,11 +209,10 @@ export async function loadMonitors(pageId: string, window: Window, pageDisplayMo // silently broken), aggregate directly from pings. Bounded by the window so // it stays cheap. Once the rollup catches up this branch never fires. if (rollupRows.length === 0) { - // Group/order by ordinals — Postgres won't dedupe a $-parameterised - // date_trunc() between SELECT and GROUP BY otherwise. rollupRows = await sql` SELECT monitor_id, + COALESCE(region, 'default') AS region, date_trunc(${truncUnit}, checked_at) AS bucket_start, count(*)::int AS total, count(*) FILTER (WHERE up)::int AS up_count, @@ -269,21 +220,59 @@ export async function loadMonitors(pageId: string, window: Window, pageDisplayMo FROM pings WHERE monitor_id = ANY(${sql.array(ids)}::text[]) AND checked_at > date_trunc(${truncUnit}, now()) - ${intervalLiteral}::interval - GROUP BY 1, 2 - ORDER BY 1, 2 ASC + GROUP BY 1, 2, 3 + ORDER BY 1, 2, 3 ASC `; } - // Index actual rollup data by (monitor_id, isoBucketStart) so we can fill in - // the missing slots below. - const indexed: Record> = {}; + + // Single pass over rollup rows builds three indices: + // indexed[mid][isoStart] → cross-region {total, up} for bar coloring + // regionLat[mid][region] → cross-window weighted latency for picking fastest region + // regionBucketLat[mid][region][isoStart] → per-bucket latency for the fastest-region tooltip lookup + const indexed: Record> = {}; + const regionLat: Record> = {}; + const regionBucketLat: Record>> = {}; for (const r of rollupRows) { const startIso = r.bucket_start instanceof Date ? r.bucket_start.toISOString() : String(r.bucket_start); + + // Cross-region bucket totals (for bar coloring) if (!indexed[r.monitor_id]) indexed[r.monitor_id] = {}; - indexed[r.monitor_id]![startIso] = { total: r.total, up: r.up_count, avg_latency: r.avg_latency ?? null }; + const slot = indexed[r.monitor_id]![startIso] ?? { total: 0, up: 0 }; + slot.total += Number(r.total); + slot.up += Number(r.up_count); + indexed[r.monitor_id]![startIso] = slot; + + // Per-region latency tracking + if (r.avg_latency != null && Number(r.total) > 0) { + if (!regionLat[r.monitor_id]) regionLat[r.monitor_id] = {}; + const acc = regionLat[r.monitor_id]![r.region] ?? { sum: 0, n: 0 }; + acc.sum += Number(r.avg_latency) * Number(r.total); + acc.n += Number(r.total); + regionLat[r.monitor_id]![r.region] = acc; + + if (!regionBucketLat[r.monitor_id]) regionBucketLat[r.monitor_id] = {}; + if (!regionBucketLat[r.monitor_id]![r.region]) regionBucketLat[r.monitor_id]![r.region] = {}; + regionBucketLat[r.monitor_id]![r.region]![startIso] = Math.round(Number(r.avg_latency)); + } + } + + // Pick the fastest region per monitor (lowest weighted average latency over + // the whole window). All per-bucket latency display falls back to this + // region's per-bucket numbers; the per-monitor avg_latency uses the same. + const fastestRegionByMonitor: Record = {}; + const fastestLatency: Record = {}; + for (const id of ids) { + let bestRegion: string | null = null; + let bestAvg = Infinity; + const regions = regionLat[id] ?? {}; + for (const [region, acc] of Object.entries(regions)) { + if (acc.n === 0) continue; + const avg = acc.sum / acc.n; + if (avg < bestAvg) { bestAvg = avg; bestRegion = region; } + } + fastestRegionByMonitor[id] = bestRegion; + fastestLatency[id] = bestRegion != null ? Math.round(bestAvg) : null; } - // Customer-facing latency = average of the fastest region for the page's - // window. Computed via a separate query that retains per-region info. - const fastestLatency = await loadFastestRegionLatency(ids, bucket, intervalLiteral); // Generate the full sequence of expected bucket timestamps so empty bars // render as "no data" instead of disappearing entirely. Truncate `now()` to @@ -303,10 +292,13 @@ export async function loadMonitors(pageId: string, window: Window, pageDisplayMo const bucketsByMonitor: Record = {}; for (const id of ids) { const slotMap = indexed[id] ?? {}; + const bestRegion = fastestRegionByMonitor[id]; + const fastestBuckets = bestRegion ? regionBucketLat[id]?.[bestRegion] ?? {} : {}; bucketsByMonitor[id] = slotIsos.map((iso) => { const hit = slotMap[iso]; + const lat = fastestBuckets[iso] ?? null; return hit - ? { start: iso, total: hit.total, up: hit.up, avg_latency: hit.avg_latency != null ? Math.round(hit.avg_latency) : null } + ? { start: iso, total: hit.total, up: hit.up, avg_latency: lat } : { start: iso, total: 0, up: 0, avg_latency: null }; }); }