Skip to content

Commit a43be14

Browse files
committed
chore: pcc sync worker
Signed-off-by: Uroš Marolt <[email protected]>
1 parent ac322ca commit a43be14

34 files changed

Lines changed: 1266 additions & 32 deletions
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- Down migration: revert the PCC sync worker schema changes.

-- Drop the partial unique (dedup) index first; IF EXISTS keeps this idempotent.
DROP INDEX IF EXISTS pcc_sync_errors_dedup_idx;

-- Remove the catch-all table of PCC sync errors awaiting manual review.
DROP TABLE IF EXISTS pcc_projects_sync_errors;

-- Remove the maturity column that the up migration added to segments.
ALTER TABLE segments DROP COLUMN IF EXISTS maturity;
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
-- Up migration for the PCC sync worker.

-- Add maturity field to segments for PCC project_maturity_level sync.
-- Nullable TEXT: not every segment originates from a PCC project.
ALTER TABLE segments ADD COLUMN IF NOT EXISTS maturity TEXT NULL;

-- Catch-all table for PCC sync issues that require manual review.
CREATE TABLE IF NOT EXISTS pcc_projects_sync_errors (
    id BIGSERIAL PRIMARY KEY,
    run_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- when the sync run recorded the error
    external_project_id TEXT,                   -- PCC project id; nullable for schema-level errors
    external_project_slug TEXT,
    error_type TEXT NOT NULL,
    details JSONB,
    resolved BOOLEAN NOT NULL DEFAULT FALSE     -- flipped once a human has reviewed the row
);

-- Deduplication index: one unresolved error per (project, error_type).
-- On repeated daily exports the same error upserts in place instead of accumulating rows.
-- Excludes rows where external_project_id IS NULL (e.g. SCHEMA_MISMATCH with no project id).
CREATE UNIQUE INDEX IF NOT EXISTS pcc_sync_errors_dedup_idx
    ON pcc_projects_sync_errors (external_project_id, error_type)
    WHERE NOT resolved AND external_project_id IS NOT NULL;
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Build stage: install the pnpm workspace; python3/make/g++ are needed for
# native-module compilation during install.
FROM node:20-bullseye-slim AS builder

RUN apt-get update && apt-get install -y python3 make g++ && rm -rf /var/lib/apt/lists/*

WORKDIR /usr/crowd/app
# NOTE(review): "[email protected]" looks like an email-obfuscated "pnpm@<version>"
# from the web scrape — confirm the actual pinned pnpm version in the repo.
RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare [email protected] --activate

# Copy only the lockfile/workspace manifest first so the dependency-fetch
# layer caches independently of source changes.
COPY ./pnpm-workspace.yaml ./pnpm-lock.yaml ./
RUN pnpm fetch

COPY ./services ./services
RUN pnpm i --frozen-lockfile

# Runtime stage: slim image with ca-certificates only — no build toolchain.
FROM node:20-bullseye-slim AS runner

RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*

WORKDIR /usr/crowd/app
# NOTE(review): same obfuscated pnpm version as the builder stage — keep both in sync.
RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare [email protected] --activate

# Copy installed modules and just the workspace pieces this worker needs.
COPY --from=builder /usr/crowd/app/node_modules ./node_modules
COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json
COPY --from=builder /usr/crowd/app/services/libs ./services/libs
COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes
COPY --from=builder /usr/crowd/app/services/apps/pcc_sync_worker/ ./services/apps/pcc_sync_worker
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Keep the Docker build context minimal: exclude VCS data, dependency and
# build output directories, env files, editor config, and unrelated repo areas.
**/.git
**/node_modules
**/venv*
**/.webpack
**/.serverless
**/.env
**/.env.*
**/.idea
**/.vscode
**/dist
.vscode/
.github/
frontend/
scripts/
.flake8
*.md
Makefile
backend/
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
version: '3.1'

# Environment shared by both service variants via a YAML anchor.
x-env-args: &env-args
  DOCKER_BUILDKIT: 1
  NODE_ENV: docker
  SERVICE: pcc-sync-worker
  CROWD_TEMPORAL_TASKQUEUE: pccSync
  SHELL: /bin/sh

services:
  # Production-style container: plain `pnpm run start`, auto-restarts.
  pcc-sync-worker:
    build:
      context: ../../
      dockerfile: ./scripts/services/docker/Dockerfile.pcc_sync_worker
    command: 'pnpm run start'
    working_dir: /usr/crowd/app/services/apps/pcc_sync_worker
    env_file:
      - ../../backend/.env.dist.local
      - ../../backend/.env.dist.composed
      - ../../backend/.env.override.local
      - ../../backend/.env.override.composed
    environment:
      <<: *env-args
    restart: always
    networks:
      - crowd-bridge

  # Dev container: live-reload via bind-mounted sources, no restart policy.
  pcc-sync-worker-dev:
    build:
      context: ../../
      dockerfile: ./scripts/services/docker/Dockerfile.pcc_sync_worker
    command: 'pnpm run dev'
    working_dir: /usr/crowd/app/services/apps/pcc_sync_worker
    env_file:
      - ../../backend/.env.dist.local
      - ../../backend/.env.dist.composed
      - ../../backend/.env.override.local
      - ../../backend/.env.override.composed
    environment:
      <<: *env-args
    hostname: pcc-sync-worker
    networks:
      - crowd-bridge
    # Mount the library/app sources so nodemon inside the container sees edits.
    volumes:
      - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src
      - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src
      - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src
      - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src
      - ../../services/apps/pcc_sync_worker/src:/usr/crowd/app/services/apps/pcc_sync_worker/src

networks:
  crowd-bridge:
    external: true
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"name": "@crowd/pcc-sync-worker",
3+
"scripts": {
4+
"start": "CROWD_TEMPORAL_TASKQUEUE=pccSync SERVICE=pcc-sync-worker tsx src/index.ts",
5+
"start:debug": "CROWD_TEMPORAL_TASKQUEUE=pccSync SERVICE=pcc-sync-worker LOG_LEVEL=debug tsx src/index.ts",
6+
"start:debug:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=pccSync SERVICE=pcc-sync-worker LOG_LEVEL=debug tsx src/index.ts",
7+
"dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug",
8+
"dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local",
9+
"lint": "npx eslint --ext .ts src --max-warnings=0",
10+
"format": "npx prettier --write \"src/**/*.ts\"",
11+
"format-check": "npx prettier --check .",
12+
"tsc-check": "tsc --noEmit",
13+
"trigger-export": "SERVICE=pcc-sync-worker tsx src/scripts/triggerExport.ts",
14+
"trigger-cleanup": "SERVICE=pcc-sync-worker tsx src/scripts/triggerCleanup.ts"
15+
},
16+
"dependencies": {
17+
"@crowd/archetype-standard": "workspace:*",
18+
"@crowd/archetype-worker": "workspace:*",
19+
"@crowd/common": "workspace:*",
20+
"@crowd/database": "workspace:*",
21+
"@crowd/logging": "workspace:*",
22+
"@crowd/slack": "workspace:*",
23+
"@crowd/snowflake": "workspace:*",
24+
"@crowd/temporal": "workspace:*",
25+
"@temporalio/client": "~1.11.8",
26+
"@temporalio/workflow": "~1.11.8",
27+
"tsx": "^4.7.1",
28+
"typescript": "^5.6.3"
29+
},
30+
"devDependencies": {
31+
"nodemon": "^3.0.1"
32+
}
33+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database'
2+
import { getServiceChildLogger } from '@crowd/logging'
3+
import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack'
4+
import { MetadataStore, S3Service } from '@crowd/snowflake'
5+
6+
const log = getServiceChildLogger('cleanupActivity')

// Platform key identifying PCC rows in the export-job metadata store.
const PLATFORM = 'pcc'
9+
10+
export async function executeCleanup(intervalHours = 24): Promise<void> {
11+
const db = await getDbConnection(WRITE_DB_CONFIG())
12+
const metadataStore = new MetadataStore(db)
13+
const s3Service = new S3Service()
14+
15+
const jobs = await metadataStore.getCleanableJobS3Paths(intervalHours, PLATFORM, false)
16+
log.info({ jobCount: jobs.length, intervalHours }, 'Found cleanable PCC jobs')
17+
18+
for (const job of jobs) {
19+
try {
20+
await s3Service.deleteFile(job.s3Path)
21+
await metadataStore.markCleaned(job.id)
22+
log.info({ jobId: job.id, s3Path: job.s3Path }, 'Cleaned PCC job')
23+
} catch (err) {
24+
log.error({ jobId: job.id, s3Path: job.s3Path, err }, 'Failed to clean PCC job, skipping')
25+
sendSlackNotification(
26+
SlackChannel.CDP_INTEGRATIONS_ALERTS,
27+
SlackPersona.ERROR_REPORTER,
28+
'PCC S3 Cleanup Failed',
29+
`Failed to clean job \`${job.id}\` at \`${job.s3Path}\`.\n\n*Error:* ${err instanceof Error ? err.message : err}`,
30+
)
31+
}
32+
}
33+
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/**
2+
* Export activity: Execute PCC recursive CTE COPY INTO + write metadata.
3+
*
4+
* Full daily export of ANALYTICS.SILVER_DIM.PROJECTS via recursive CTE.
5+
* No incremental logic — at ~1,538 leaf rows, a full daily export is simpler
6+
* and more reliable than incremental (a parent name change would require
7+
* re-exporting all descendants).
8+
*/
9+
import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database'
10+
import { getServiceChildLogger } from '@crowd/logging'
11+
import { MetadataStore, SnowflakeExporter } from '@crowd/snowflake'
12+
13+
const log = getServiceChildLogger('exportActivity')

// Keys used for export-job metadata rows and S3 path construction.
const PLATFORM = 'pcc'
const SOURCE_NAME = 'project-hierarchy'
17+
18+
/**
 * Build the Snowflake source query for the full PCC project hierarchy.
 *
 * A recursive CTE walks ANALYTICS.SILVER_DIM.PROJECTS from the roots
 * (parent_id IS NULL) downward, carrying the ancestor name at each level in
 * depth_1..depth_5. The outer SELECT keeps only leaf projects (ids that never
 * appear as anyone's parent_id) and left-joins ACTIVE_SEGMENTS to attach the
 * matching subproject segment id when one exists (NULL otherwise).
 *
 * NOTE(review): hierarchies deeper than 5 levels would leave depth_5 as the
 * last recorded ancestor — presumably the data never exceeds depth 5; confirm.
 */
function buildSourceQuery(): string {
  return `
    WITH RECURSIVE project_hierarchy AS (
      SELECT project_id, name, description, project_logo, project_status,
             project_maturity_level, repository_url, slug, parent_id,
             1 AS depth,
             name AS depth_1, NULL::VARCHAR AS depth_2, NULL::VARCHAR AS depth_3,
             NULL::VARCHAR AS depth_4, NULL::VARCHAR AS depth_5
      FROM ANALYTICS.SILVER_DIM.PROJECTS
      WHERE parent_id IS NULL
      UNION ALL
      SELECT p.project_id, p.name, p.description, p.project_logo, p.project_status,
             p.project_maturity_level, p.repository_url, p.slug, p.parent_id,
             h.depth + 1,
             h.depth_1,
             CASE WHEN h.depth + 1 = 2 THEN p.name ELSE h.depth_2 END,
             CASE WHEN h.depth + 1 = 3 THEN p.name ELSE h.depth_3 END,
             CASE WHEN h.depth + 1 = 4 THEN p.name ELSE h.depth_4 END,
             CASE WHEN h.depth + 1 = 5 THEN p.name ELSE h.depth_5 END
      FROM ANALYTICS.SILVER_DIM.PROJECTS p
      INNER JOIN project_hierarchy h ON p.parent_id = h.project_id
    )
    SELECT ph.project_id, ph.name, ph.slug, ph.description, ph.project_logo, ph.repository_url,
           ph.project_status, ph.project_maturity_level, ph.depth,
           ph.depth_1, ph.depth_2, ph.depth_3, ph.depth_4, ph.depth_5,
           s.segment_id
    FROM project_hierarchy ph
    LEFT JOIN ANALYTICS.SILVER_DIM.ACTIVE_SEGMENTS s
      ON s.source_id = ph.project_id AND s.project_type = 'subproject'
    WHERE ph.project_id NOT IN (
      SELECT DISTINCT parent_id FROM ANALYTICS.SILVER_DIM.PROJECTS
      WHERE parent_id IS NOT NULL
    )
  `
}
53+
54+
function buildS3FilenamePrefix(): string {
55+
const now = new Date()
56+
const year = now.getFullYear()
57+
const month = String(now.getMonth() + 1).padStart(2, '0')
58+
const day = String(now.getDate()).padStart(2, '0')
59+
const s3BucketPath = process.env.CROWD_SNOWFLAKE_S3_BUCKET_PATH
60+
if (!s3BucketPath) {
61+
throw new Error('Missing required env var CROWD_SNOWFLAKE_S3_BUCKET_PATH')
62+
}
63+
return `${s3BucketPath}/${PLATFORM}/${SOURCE_NAME}/${year}/${month}/${day}`
64+
}
65+
66+
export async function executeExport(): Promise<void> {
67+
log.info({ platform: PLATFORM, sourceName: SOURCE_NAME }, 'Starting PCC export')
68+
69+
const exporter = new SnowflakeExporter()
70+
const db = await getDbConnection(WRITE_DB_CONFIG())
71+
72+
try {
73+
const metadataStore = new MetadataStore(db)
74+
const sourceQuery = buildSourceQuery()
75+
const s3FilenamePrefix = buildS3FilenamePrefix()
76+
const exportStartedAt = new Date()
77+
78+
const onBatchComplete = async (s3Path: string, totalRows: number, totalBytes: number) => {
79+
await metadataStore.insertExportJob(
80+
PLATFORM,
81+
SOURCE_NAME,
82+
s3Path,
83+
totalRows,
84+
totalBytes,
85+
exportStartedAt,
86+
)
87+
}
88+
89+
await exporter.executeBatchedCopyInto(sourceQuery, s3FilenamePrefix, onBatchComplete)
90+
91+
log.info({ platform: PLATFORM, sourceName: SOURCE_NAME }, 'PCC export completed')
92+
} catch (err) {
93+
log.error({ platform: PLATFORM, sourceName: SOURCE_NAME, err }, 'PCC export failed')
94+
throw err
95+
} finally {
96+
await exporter
97+
.destroy()
98+
.catch((err) => log.warn({ err }, 'Failed to close Snowflake connection'))
99+
}
100+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// Activity entry points registered with the Temporal worker.
export { executeExport } from './exportActivity'
export { executeCleanup } from './cleanupActivity'
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/**
 * Centralized configuration: Temporal.
 *
 * Re-exports the shared Temporal config/client helpers so app code imports
 * them from a single local module instead of depending on @crowd/temporal
 * directly.
 */

export { TEMPORAL_CONFIG, getTemporalClient } from '@crowd/temporal'
export type { ITemporalConfig } from '@crowd/temporal'

0 commit comments

Comments
 (0)