// Source: pingql/apps/status/src/data.ts (TypeScript; listed as 587 lines, 23 KiB)

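// Loads the read-only data needed to render a public status page. On the
// normal path this never reads the raw `pings` table - it uses
// `monitor_region_state` for current state and `monitor_uptime_rollup` for
// historical uptime windows. The single exception is the fallback inside
// loadMonitors, which aggregates directly from `pings` when the rollup query
// returns no rows at all.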
import sql from "./db";
// Granularity of one bar on the uptime chart. The same two literals are the
// `bucket_type` values that loadMonitors filters on in the rollup query.
export type BucketType = "hourly" | "daily";
// Internal shape of one `status_pages` row, as returned by the `SELECT *`
// lookups in loadStatusPage / loadStatusPageByDomain. This is NOT the public
// shape: redactPageForPublic copies only a subset of these fields into the
// anonymous payload.
export interface StatusPageRow {
// Internal page identifier; used as `status_page_id` in the group, monitor
// and incident queries.
id: string;
// Owning account; also scopes the per-monitor incident query in
// loadMonitorDetail.
account_id: string;
// Lookup key used by loadStatusPage.
slug: string;
title: string;
description: string | null;
theme: "auto" | "light" | "dark";
// Never exposed directly - the public view only carries a derived
// `has_password` boolean.
password_hash: string | null;
index_search: boolean;
show_powered_by: boolean;
show_response_time:boolean;
// Bar chart settings, passed straight to loadMonitors (which clamps the
// count to 1..180).
bar_frequency: BucketType;
bar_count: number;
// Lookup key used by loadStatusPageByDomain and verifyDomain.
custom_domain: string | null;
custom_css: string | null;
footer_text: string | null;
// Presumably the client auto-refresh interval in seconds - confirm against
// the renderer; this module only passes it through.
auto_refresh_s: number;
}
// Per-monitor payload built by loadMonitors and embedded in both the page
// payload and the monitor detail payload.
export interface MonitorRow {
// The attached monitor's id (`status_page_monitors.monitor_id`).
id: string;
// The page-level display override when one is set, otherwise the monitor's
// own name.
display_name: string;
// Note: the underlying monitor.url is intentionally NOT exposed on the
// public payload. Status pages display `display_name`; the literal target
// URL (which can contain auth tokens, internal hostnames, staging paths,
// etc.) must never leak to anonymous visitors via the JSON endpoint.
// Group correlator. Emitted as the matching group's `position` index
// (0-based string), NOT the underlying UUID, so the JSON doesn't leak
// internal IDs. The HTML render works the same either way - it just
// looks up groups by this token.
// (loadMonitors itself returns the raw group id; the swap to the public
// token happens in redactGroupsAndMonitors.)
group_id: string | null;
position: number;
// 'paused' means the monitor was disabled in the dashboard - the runner has
// stopped checking it, and the public page should treat it as planned
// maintenance rather than an outage.
// Otherwise: "down" if any region reports down, else "up" if any region
// reports up, else "unknown".
current_state: "up" | "down" | "unknown" | "paused";
// One entry per `monitor_region_state` row for this monitor.
region_states: Array<{ region: string; state: "up" | "down" | "unknown"; updated_at: string | null }>;
// 100 * up / total summed over `buckets`; null when the bar window holds no
// checks at all.
uptime_pct: number | null;
buckets: Array<{ start: string; total: number; up: number; avg_latency: number | null }>; // bar chart input
// Rounded, check-count-weighted average latency of the fastest region over
// the bar window; null when no region has latency data.
avg_latency: number | null;
// Hourly rollup points from the last 30 hours across all regions, sorted by
// `ts` ascending.
latency_history: Array<{ region: string; latency_ms: number | null; ts: string }>;
}
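// Multi-window uptime (24h / 7d / 30d / 90d) is meant to be derived from the
// same rollup row set that loadMonitors pulls for the bar chart, via an in-JS
// aggregation pass. This replaced a second SQL round-trip that ran four
// FILTER aggregates, redoing arithmetic the raw bucket rows already contained.
// NOTE(review): in the code visible in this file, loadMonitors only computes
// a single `uptime_pct` over the bar window and MonitorRow has no per-window
// fields - confirm whether the multi-window pass lives elsewhere or this note
// is stale.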
// One `status_page_groups` row as selected by loadGroups. `id` is the internal
// group identifier and is replaced by an opaque token before the group reaches
// the public payload (see redactGroupsAndMonitors).
export interface GroupRow {
id: string;
name: string;
// Primary sort key in loadGroups (ties broken by name).
position: number;
}
// One entry of an incident's timeline, read from `incident_updates`.
export interface IncidentUpdateRow {
id: string;
status: string;
// Passed through from the `body_html` column unchanged by this module.
// NOTE(review): presumably pre-rendered, already-sanitised HTML - confirm
// where sanitisation happens before rendering it unescaped.
body_html: string;
// ISO-8601 string when the driver returns a Date; otherwise the value's
// String() form.
created_at: string;
}
// Public view of one incident plus its complete update timeline, as built by
// loadIncidents and loadMonitorDetail.
export interface IncidentSummary {
id: string;
title: string;
status: string;
severity: string;
// Together with a null `resolved_at`, decides whether loadIncidents files the
// incident under `active` instead of `recent`.
pinned: boolean;
// ISO-8601 string when the driver returns a Date; otherwise String(value).
started_at: string;
// Null while the incident is unresolved.
resolved_at: string | null;
updates: IncidentUpdateRow[]; // full timeline, newest first
}
/**
 * Looks up a status page by its slug.
 *
 * @param slug - Value matched exactly against `status_pages.slug`.
 * @returns The first matching row, or `null` when no page has that slug.
 */
export async function loadStatusPage(slug: string): Promise<StatusPageRow | null> {
  const matches = await sql<StatusPageRow[]>`SELECT * FROM status_pages WHERE slug = ${slug}`;
  const page = matches[0];
  return page ?? null;
}
/**
 * Looks up a status page by the custom domain it is served on.
 *
 * @param domain - Value matched exactly against `status_pages.custom_domain`.
 * @returns The first matching row, or `null` when no page uses that domain.
 */
export async function loadStatusPageByDomain(domain: string): Promise<StatusPageRow | null> {
  const matches = await sql<StatusPageRow[]>`SELECT * FROM status_pages WHERE custom_domain = ${domain}`;
  const page = matches[0];
  return page ?? null;
}
/**
 * Reports whether any status page is configured with the given custom domain.
 *
 * @param domain - Value matched exactly against `status_pages.custom_domain`.
 * @returns `true` when at least one row matches, `false` otherwise.
 */
export async function verifyDomain(domain: string): Promise<boolean> {
  const matches = await sql<{ id: string }[]>`SELECT id FROM status_pages WHERE custom_domain = ${domain}`;
  const firstMatch = matches[0];
  return Boolean(firstMatch);
}
/**
 * Loads the groups defined on a status page.
 *
 * @param pageId - `status_pages.id` of the page whose groups are wanted.
 * @returns The page's groups ordered by position, then by name.
 */
export async function loadGroups(pageId: string): Promise<GroupRow[]> {
  // SQL text is kept flush-left so the query string is unchanged.
  const groupRows = await sql<GroupRow[]>`
SELECT id, name, position FROM status_page_groups
WHERE status_page_id = ${pageId}
ORDER BY position ASC, name ASC
`;
  return groupRows;
}
/**
 * Loads every monitor attached to a status page together with its current
 * per-region state, bar-chart buckets, bar-window uptime and a 30h latency
 * sparkline.
 *
 * Runs three queries (attached monitors, `monitor_region_state`, one unified
 * `monitor_uptime_rollup` read). If - and only if - the rollup read returns no
 * rows at all, the same shape is aggregated directly from `pings` instead.
 *
 * @param pageId - `status_pages.id` whose attached monitors are loaded.
 * @param barFrequency - Bucket granularity of the bar chart.
 * @param barCount - Requested number of bars. Non-finite values fall back to
 *   the default of 90, fractional values are truncated, and the result is
 *   clamped to 1..180.
 * @returns One row per attached monitor, ordered by page position then
 *   monitor name. `group_id` is still the raw `status_page_monitors.group_id`
 *   here; this function does not redact it.
 */
export async function loadMonitors(
  pageId: string,
  barFrequency: BucketType = "daily",
  barCount: number = 90,
): Promise<MonitorRow[]> {
  type RegionState = MonitorRow["region_states"][number];
  type AttachedMonitorRow = {
    id: string;
    display_name: string;
    enabled: boolean | null;
    group_id: string | null;
    position: number;
  };
  type RollupRow = {
    monitor_id: string;
    region: string;
    bucket_type: BucketType;
    bucket_start: Date | string;
    total: number | string;
    up_count: number | string;
    avg_latency: number | string | null;
  };
  type RegionAcc = { sum: number; n: number; perBucket: Map<string, number> };
  type MonitorAcc = {
    // isoStart → cross-region {total, up}; drives bar colouring.
    slots: Map<string, { total: number; up: number }>;
    // region → weighted latency over the bar window + per-bucket latency.
    regions: Map<string, RegionAcc>;
    // Hourly latency points from the last 30 hours (all regions).
    sparkline: MonitorRow["latency_history"];
  };

  // Timestamps are handled as "Date or string" elsewhere in this file (bucket
  // starts, incident timestamps). The payload type declares strings, so
  // normalise to ISO-8601 instead of passing the driver value through.
  const toIsoOrNull = (value: Date | string | null | undefined): string | null => {
    if (value == null) return null;
    if (value instanceof Date) return Number.isNaN(value.getTime()) ? null : value.toISOString();
    return String(value);
  };

  // Lowest weighted-average latency wins; regions without samples are skipped.
  const pickFastest = (acc: MonitorAcc | undefined): { region: RegionAcc; avg: number } | null => {
    let best: { region: RegionAcc; avg: number } | null = null;
    if (!acc) return best;
    for (const regionAcc of acc.regions.values()) {
      if (regionAcc.n === 0) continue;
      const avg = regionAcc.sum / regionAcc.n;
      if (avg < (best?.avg ?? Infinity)) best = { region: regionAcc, avg };
    }
    return best;
  };

  // Step 1: page → monitors with display overrides + group + position. Pull
  // m.enabled too so disabled monitors can be rendered as "paused": the runner
  // stops checking them, so their region states would otherwise go stale.
  // Deliberately does NOT select m.url - the raw target URL must never reach
  // the public payload. (SQL text is kept flush-left so the query strings are
  // byte-identical to before.)
  const monitorRows = await sql<AttachedMonitorRow[]>`
SELECT
spm.monitor_id AS id,
COALESCE(spm.display_name, m.name) AS display_name,
m.enabled AS enabled,
spm.group_id,
spm.position
FROM status_page_monitors spm
JOIN monitors m ON m.id = spm.monitor_id
WHERE spm.status_page_id = ${pageId}
ORDER BY spm.position ASC, m.name ASC
`;
  if (monitorRows.length === 0) return [];
  const ids = monitorRows.map((r) => r.id);

  // Step 2: per-region current state for these monitors.
  const stateRows = await sql<{
    monitor_id: string;
    region: string;
    last_state: string | null;
    updated_at: Date | string | null;
  }[]>`
SELECT monitor_id, region, last_state, updated_at
FROM monitor_region_state
WHERE monitor_id = ANY(${sql.array(ids)}::text[])
`;
  const stateByMonitor = new Map<string, RegionState[]>();
  for (const s of stateRows) {
    // Anything other than the two known states (including NULL) is reported
    // as "unknown" so the payload always matches its declared union.
    const state: RegionState["state"] =
      s.last_state === "up" ? "up" : s.last_state === "down" ? "down" : "unknown";
    const entry: RegionState = { region: s.region, state, updated_at: toIsoOrNull(s.updated_at) };
    const existing = stateByMonitor.get(s.monitor_id);
    if (existing) existing.push(entry);
    else stateByMonitor.set(s.monitor_id, [entry]);
  }

  // Step 3: ONE unified rollup query: hourly rows back N hours OR daily rows
  // back N days. The rows are partitioned by purpose in JS afterwards.
  const bucket: BucketType = barFrequency;
  // Guard against NaN / Infinity / fractional counts before clamping: NaN
  // would otherwise survive Math.min/Math.max and yield zero bars, and a
  // fraction would shift every slot off the bucket boundary.
  const requestedCount = Number.isFinite(barCount) ? Math.trunc(barCount) : 90;
  const count = Math.max(1, Math.min(180, requestedCount));
  // Hourly span covers the latency sparkline (30h) AND the bar chart if it is
  // hourly (up to 180h). +2h slack so the truncated bucket at the start of the
  // window is included even if an hour boundary is crossed during the request.
  const hourlyBackHours = Math.max(30, bucket === "hourly" ? count : 0) + 2;
  // Daily span is at least 90d, or the bar window if that is daily and wider.
  // +1d slack for the same reason.
  // NOTE(review): when barFrequency is "hourly", the daily rows fetched here
  // are not consumed anywhere in this function. The window is left unchanged
  // because narrowing it would also change when the `pings` fallback fires.
  const dailyBackDays = Math.max(90, bucket === "daily" ? count : 0) + 1;
  const hourlyInterval = `${hourlyBackHours} hours`;
  const dailyInterval = `${dailyBackDays} days`;
  let rollupRows = await sql<RollupRow[]>`
SELECT monitor_id, region, bucket_type, bucket_start, total, up_count, avg_latency
FROM monitor_uptime_rollup
WHERE monitor_id = ANY(${sql.array(ids)}::text[])
AND (
(bucket_type = 'hourly' AND bucket_start > now() - ${hourlyInterval}::interval)
OR
(bucket_type = 'daily' AND bucket_start > now() - ${dailyInterval}::interval)
)
ORDER BY monitor_id, bucket_type, region, bucket_start ASC
`;
  // Fallback: if the rollup table has nothing for any of these monitors in
  // either bucket type, aggregate directly from `pings`. Produces both bucket
  // types via UNION ALL so the JS below doesn't need to know which path the
  // rows came from. Bounded by the same two windows.
  if (rollupRows.length === 0) {
    rollupRows = await sql<RollupRow[]>`
(
SELECT
monitor_id,
COALESCE(region, 'default') AS region,
'hourly'::text AS bucket_type,
date_trunc('hour', checked_at) AS bucket_start,
count(*)::int AS total,
count(*) FILTER (WHERE up)::int AS up_count,
avg(latency_ms)::real AS avg_latency
FROM pings
WHERE monitor_id = ANY(${sql.array(ids)}::text[])
AND checked_at > now() - ${hourlyInterval}::interval
GROUP BY 1, 2, 4
)
UNION ALL
(
SELECT
monitor_id,
COALESCE(region, 'default') AS region,
'daily'::text AS bucket_type,
date_trunc('day', checked_at) AS bucket_start,
count(*)::int AS total,
count(*) FILTER (WHERE up)::int AS up_count,
avg(latency_ms)::real AS avg_latency
FROM pings
WHERE monitor_id = ANY(${sql.array(ids)}::text[])
AND checked_at > now() - ${dailyInterval}::interval
GROUP BY 1, 2, 4
)
`;
  }

  // One clock reading for both the sparkline cutoff and the slot sequence, so
  // the two can never straddle a bucket boundary differently.
  const now = new Date();
  const nowMs = now.getTime();
  const sparklineWindowMs = 30 * 3600_000;

  // Single pass over the unified rows builds every per-monitor index.
  const accByMonitor = new Map<string, MonitorAcc>();
  for (const r of rollupRows) {
    const startDate = r.bucket_start instanceof Date ? r.bucket_start : new Date(r.bucket_start);
    const startIso = startDate.toISOString();
    const total = Number(r.total);
    const up = Number(r.up_count);
    const avgLat = r.avg_latency == null ? null : Number(r.avg_latency);

    let acc = accByMonitor.get(r.monitor_id);
    if (!acc) {
      acc = { slots: new Map(), regions: new Map(), sparkline: [] };
      accByMonitor.set(r.monitor_id, acc);
    }

    // Bar chart accumulators - only rows matching the configured frequency.
    if (r.bucket_type === bucket) {
      const slot = acc.slots.get(startIso);
      if (slot) {
        slot.total += total;
        slot.up += up;
      } else {
        acc.slots.set(startIso, { total, up });
      }
      if (avgLat != null && total > 0) {
        let regionAcc = acc.regions.get(r.region);
        if (!regionAcc) {
          regionAcc = { sum: 0, n: 0, perBucket: new Map() };
          acc.regions.set(r.region, regionAcc);
        }
        // Weight each bucket's average by its check count.
        regionAcc.sum += avgLat * total;
        regionAcc.n += total;
        regionAcc.perBucket.set(startIso, Math.round(avgLat));
      }
    }

    // 30h hourly latency sparkline.
    if (r.bucket_type === "hourly" && nowMs - startDate.getTime() < sparklineWindowMs) {
      acc.sparkline.push({
        region: r.region,
        latency_ms: avgLat == null ? null : Math.round(avgLat),
        ts: startIso,
      });
    }
  }

  // Rows arrive ordered within a region but interleaved across regions, so
  // order each sparkline by timestamp. ISO-8601 strings produced by
  // toISOString sort chronologically under plain code-unit comparison.
  for (const acc of accByMonitor.values()) {
    acc.sparkline.sort((a, b) => (a.ts < b.ts ? -1 : a.ts > b.ts ? 1 : 0));
  }

  // Generate the full sequence of expected bucket timestamps so empty bars
  // render as "no data" instead of disappearing. "Now" is truncated to the
  // unit (in UTC) so slot boundaries line up with the bucket starts.
  const bucketMs = bucket === "hourly" ? 3600_000 : 86_400_000;
  const nowTrunc = new Date(nowMs);
  if (bucket === "hourly") nowTrunc.setUTCMinutes(0, 0, 0);
  else nowTrunc.setUTCHours(0, 0, 0, 0);
  const nowTruncMs = nowTrunc.getTime();
  const slotIsos = Array.from({ length: count }, (_, i) =>
    new Date(nowTruncMs - (count - 1 - i) * bucketMs).toISOString(),
  );

  return monitorRows.map((m): MonitorRow => {
    const acc = accByMonitor.get(m.id);
    // Per-bucket latency and the per-monitor avg_latency both come from the
    // fastest region over the bar window.
    const fastest = pickFastest(acc);

    const buckets: MonitorRow["buckets"] = slotIsos.map((iso) => {
      const hit = acc?.slots.get(iso);
      return hit
        ? { start: iso, total: hit.total, up: hit.up, avg_latency: fastest?.region.perBucket.get(iso) ?? null }
        : { start: iso, total: 0, up: 0, avg_latency: null };
    });

    const region_states = stateByMonitor.get(m.id) ?? [];
    let current_state: MonitorRow["current_state"] = "unknown";
    if (region_states.some((s) => s.state === "down")) current_state = "down";
    else if (region_states.some((s) => s.state === "up")) current_state = "up";
    // A disabled monitor is in operator-declared maintenance. Override
    // whatever the last region state was so the page does not show a stale
    // "up".
    if (m.enabled === false) current_state = "paused";

    let totalChecks = 0;
    let upChecks = 0;
    for (const b of buckets) {
      totalChecks += b.total;
      upChecks += b.up;
    }
    // Full precision - rounding is left to the display layer so that small
    // amounts of downtime are not rounded up to 100% here.
    const uptime_pct = totalChecks > 0 ? (100 * upChecks) / totalChecks : null;

    return {
      id: m.id,
      display_name: m.display_name,
      group_id: m.group_id,
      position: m.position,
      current_state,
      region_states,
      uptime_pct,
      buckets,
      avg_latency: fastest ? Math.round(fastest.avg) : null,
      latency_history: acc?.sparkline ?? [],
    };
  });
}
/** Columns of an `incidents` row (from `SELECT i.*`) that this module reads. */
interface IncidentDbRow {
  id: string;
  title: string;
  status: string;
  severity: string;
  pinned: boolean;
  started_at: Date | string;
  resolved_at: Date | string | null;
}

/** Columns selected from `incident_updates` when building a timeline. */
interface IncidentUpdateDbRow {
  id: string;
  incident_id: string;
  status: string;
  body_html: string;
  created_at: Date | string;
}

/** Renders a driver timestamp as a string: ISO-8601 for Date values, String() otherwise. */
function incidentTimestampToIso(value: Date | string): string {
  return value instanceof Date ? value.toISOString() : String(value);
}

/**
 * Turns raw incident rows into public summaries, attaching each incident's
 * full update timeline (newest first). Issues one `incident_updates` query
 * for the whole batch, and none when `rows` is empty.
 *
 * @param rows - Incident rows, in the order the summaries should be returned.
 * @returns One summary per input row, in the same order.
 */
async function attachIncidentTimelines(rows: readonly IncidentDbRow[]): Promise<IncidentSummary[]> {
  if (rows.length === 0) return [];
  const ids = rows.map((i) => i.id);
  // SQL text is kept flush-left so the query string is unchanged.
  const updateRows = await sql<IncidentUpdateDbRow[]>`
SELECT id, incident_id, status, body_html, created_at
FROM incident_updates
WHERE incident_id = ANY(${sql.array(ids)}::uuid[])
ORDER BY created_at DESC
`;
  // The query orders by created_at DESC globally, so each per-incident list
  // built by appending in arrival order is newest first as well.
  const updatesByIncident = new Map<string, IncidentUpdateRow[]>();
  for (const u of updateRows) {
    const update: IncidentUpdateRow = {
      id: u.id,
      status: u.status,
      body_html: u.body_html,
      created_at: incidentTimestampToIso(u.created_at),
    };
    const timeline = updatesByIncident.get(u.incident_id);
    if (timeline) timeline.push(update);
    else updatesByIncident.set(u.incident_id, [update]);
  }
  return rows.map((i) => ({
    id: i.id,
    title: i.title,
    status: i.status,
    severity: i.severity,
    pinned: i.pinned,
    started_at: incidentTimestampToIso(i.started_at),
    resolved_at: i.resolved_at ? incidentTimestampToIso(i.resolved_at) : null,
    updates: updatesByIncident.get(i.id) ?? [],
  }));
}

/**
 * Loads the 50 most recently started incidents linked to a status page and
 * splits them into two lists.
 *
 * @param pageId - `status_pages.id` the incidents must be linked to via
 *   `incident_status_pages`.
 * @returns `active` holds incidents that are pinned AND unresolved; `recent`
 *   holds every other incident. Both keep the started_at DESC order.
 */
export async function loadIncidents(pageId: string): Promise<{ active: IncidentSummary[]; recent: IncidentSummary[] }> {
  const incidentRows = await sql<IncidentDbRow[]>`
SELECT i.*
FROM incidents i
JOIN incident_status_pages isp ON isp.incident_id = i.id
WHERE isp.status_page_id = ${pageId}
ORDER BY i.started_at DESC
LIMIT 50
`;
  const summaries = await attachIncidentTimelines(incidentRows);
  // Single-pass partition (previously a filter plus an `includes` scan per
  // element).
  const active: IncidentSummary[] = [];
  const recent: IncidentSummary[] = [];
  for (const summary of summaries) {
    if (summary.pinned && !summary.resolved_at) active.push(summary);
    else recent.push(summary);
  }
  return { active, recent };
}

/** Payload for a single monitor's public detail view. */
export interface MonitorDetailPayload {
  monitor: MonitorRow;
  incidents: IncidentSummary[]; // recent incidents that touch this monitor
  generated_at: string;
}

/**
 * Loads the public detail payload for one monitor on one status page.
 *
 * @param slug - Slug of the status page.
 * @param monitorId - Id of the monitor, which must be attached to that page.
 * @returns The payload, or `null` when the page does not exist or the monitor
 *   is not attached to it.
 */
export async function loadMonitorDetail(slug: string, monitorId: string): Promise<MonitorDetailPayload | null> {
  const page = await loadStatusPage(slug);
  if (!page) return null;

  // Existence check only - confirms the monitor is attached to this page so a
  // wrong slug/monitor combination returns null without firing the larger
  // queries below.
  const [link] = await sql<Record<string, unknown>[]>`
SELECT 1
FROM status_page_monitors spm
WHERE spm.status_page_id = ${page.id} AND spm.monitor_id = ${monitorId}
`;
  if (!link) return null;

  // The three reads below are independent, so they run in parallel:
  //  - groups: needed to swap the monitor's raw group id for the public
  //    token, matching what the page payload emits;
  //  - monitors: reuses the bulk loader so the bucket/state logic lives in
  //    one place. Note this loads EVERY monitor on the page and the requested
  //    one is picked out afterwards;
  //  - incidents touching this monitor (any status), most recent 20.
  // NOTE(review): unlike loadIncidents, the incident query is scoped by
  // account and `incident_monitors` only - it is not restricted to incidents
  // linked to this page through `incident_status_pages`. Confirm that showing
  // such incidents on a public page is intended.
  const [allGroups, allMonitors, incidentRows] = await Promise.all([
    loadGroups(page.id),
    loadMonitors(page.id, page.bar_frequency, page.bar_count),
    sql<IncidentDbRow[]>`
SELECT i.*
FROM incidents i
JOIN incident_monitors im ON im.incident_id = i.id
WHERE im.monitor_id = ${monitorId} AND i.account_id = ${page.account_id}
ORDER BY i.started_at DESC
LIMIT 20
`,
  ]);

  const { monitors } = redactGroupsAndMonitors(allGroups, allMonitors);
  const monitor = monitors.find((x) => x.id === monitorId);
  if (!monitor) return null;

  const incidents = await attachIncidentTimelines(incidentRows);
  return { monitor, incidents, generated_at: new Date().toISOString() };
}
// The shape we actually expose to anonymous visitors. Computed by stripping
// internal IDs and any field a public consumer doesn't need from the row
// types - see redactPageForPublic / redactGroupsAndMonitors below.
// Compared with StatusPageRow this omits id, account_id, password_hash,
// custom_domain and custom_css, and adds `has_password`.
export interface PublicPageView {
slug: string;
title: string;
description: string | null;
theme: "auto" | "light" | "dark";
index_search: boolean;
show_powered_by: boolean;
show_response_time: boolean;
bar_frequency: BucketType;
bar_count: number;
footer_text: string | null;
auto_refresh_s: number;
// True when the page row carries a non-empty password hash; the hash itself
// is never exposed.
has_password: boolean;
}
// Public view of a group. `id` is an opaque token assigned by
// redactGroupsAndMonitors: the group's 0-based index in the list returned by
// loadGroups, rendered as a string. It is not the stored `position` value,
// which is exposed separately below.
export interface PublicGroupView {
id: string; // re-keyed to position-as-string, NOT the underlying UUID
name: string;
position: number;
}
// Everything loadPagePayload returns for one status page: the redacted page
// settings, redacted groups and monitors, the page's incidents, and the time
// the payload was assembled.
export interface PagePayload {
page: PublicPageView;
groups: PublicGroupView[];
// `group_id` on these rows has already been swapped for the public token.
monitors: MonitorRow[];
incidents: { active: IncidentSummary[]; recent: IncidentSummary[] };
// ISO-8601 timestamp taken when the payload object was built.
generated_at: string;
}
/**
 * Builds the anonymous-visitor view of a status page row.
 *
 * Only an explicit allow-list of fields is copied across. Left out on
 * purpose:
 *  - `account_id`    - leaks the customer identifier across the platform
 *  - `id`            - internal status page identifier, no consumer needs it
 *  - `password_hash` - reduced to a `has_password` boolean
 * `custom_domain` and `custom_css` are not copied either.
 *
 * @param p - The full internal page row.
 * @returns A new object holding only the public fields.
 */
function redactPageForPublic(p: StatusPageRow): PublicPageView {
  const {
    slug,
    title,
    description,
    theme,
    index_search,
    show_powered_by,
    show_response_time,
    bar_frequency,
    bar_count,
    footer_text,
    auto_refresh_s,
    password_hash,
  } = p;
  const publicView: PublicPageView = {
    slug,
    title,
    description,
    theme,
    index_search,
    show_powered_by,
    show_response_time,
    bar_frequency,
    bar_count,
    footer_text,
    auto_refresh_s,
    has_password: Boolean(password_hash),
  };
  return publicView;
}
/**
 * Swaps internal group identifiers for opaque public tokens.
 *
 * Each group's token is its 0-based index in `groups`, rendered as a string
 * ("0", "1", "2", ...). Monitors get the same token in `group_id`, so a
 * consumer can still join monitors to groups without seeing internal ids.
 *
 * @param groups - Groups in the order they should be tokenised.
 * @param monitors - Monitors whose `group_id` still holds the internal id.
 * @returns New group and monitor arrays; the inputs are not mutated. A monitor
 *   with no group, or with a group id not present in `groups`, gets
 *   `group_id: null`.
 */
function redactGroupsAndMonitors(
  groups: GroupRow[],
  monitors: MonitorRow[],
): { groups: PublicGroupView[]; monitors: MonitorRow[] } {
  const tokenByGroupId = new Map<string, string>();
  const publicGroups: PublicGroupView[] = [];
  for (const [index, group] of groups.entries()) {
    const token = String(index);
    tokenByGroupId.set(group.id, token);
    publicGroups.push({ id: token, name: group.name, position: group.position });
  }

  const publicMonitors = monitors.map((monitor) => {
    const token = monitor.group_id ? tokenByGroupId.get(monitor.group_id) : undefined;
    return { ...monitor, group_id: token ?? null };
  });

  return { groups: publicGroups, monitors: publicMonitors };
}
/**
 * Assembles the complete public payload for a status page.
 *
 * Groups, monitors and incidents are loaded in parallel once the page row has
 * been found; group ids and page fields are redacted before being returned.
 *
 * @param slug - Slug of the status page.
 * @returns The payload, or `null` when no page has that slug.
 */
export async function loadPagePayload(slug: string): Promise<PagePayload | null> {
  const page = await loadStatusPage(slug);
  if (!page) return null;

  const [groupRows, monitorRows, incidents] = await Promise.all([
    loadGroups(page.id),
    loadMonitors(page.id, page.bar_frequency, page.bar_count),
    loadIncidents(page.id),
  ]);

  const redacted = redactGroupsAndMonitors(groupRows, monitorRows);
  const payload: PagePayload = {
    page: redactPageForPublic(page),
    groups: redacted.groups,
    monitors: redacted.monitors,
    incidents,
    generated_at: new Date().toISOString(),
  };
  return payload;
}