Skip to content

Commit c871e11

Browse files
authored
fix: use category name instead of uuid generated by llm (CM-889) (#3822)
1 parent 882fdbe commit c871e11

1 file changed

Lines changed: 148 additions & 59 deletions

File tree

services/apps/categorization_worker/src/activities/activities.ts

Lines changed: 148 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -13,26 +13,56 @@ import {
1313
import { svc } from '../main'
1414
import { IFindCategoryParams, IFindCollectionsParams, IListedCategory } from '../types'
1515

16-
function formatTextCategoriesForPrompt(categories: IListedCategory[]): string {
17-
const groupedCategories = new Map<string, string[]>()
18-
19-
for (const category of categories) {
20-
const groupName = category.categoryGroupName
21-
if (!groupedCategories.has(groupName)) {
22-
groupedCategories.set(groupName, [])
23-
}
24-
groupedCategories.get(groupName).push(category.name + '-' + category.id)
16+
/**
 * Reconciles items returned by the LLM against the reference items passed in
 * by the caller.
 *
 * Lookup order for each LLM item:
 *   1. by id, attempted only when the id matches the UUID pattern below;
 *   2. by name, compared case-insensitively.
 * An item that matches neither way is logged and dropped.
 *
 * @param llmItems - Items taken from the LLM response. A null, undefined or
 *   empty value yields an empty array.
 * @param databaseItems - Reference items to match against (the callers in this
 *   file pass the listed categories or collections).
 * @param itemType - Label used only as a prefix in the log messages.
 * @returns One object per matched item, holding the name and id of the
 *   reference entry rather than the text produced by the LLM, in the same
 *   order as llmItems.
 *
 * NOTE(review): each result is built with only name and id and then cast to T,
 * so any other field declared on T is not copied over.
 * NOTE(review): duplicates are not removed; two LLM items that resolve to the
 * same reference entry produce two results.
 * NOTE(review): the name lookup calls toLowerCase on every reference item name,
 * which assumes those names are always strings - confirm against the data.
 */
function validateAndCorrectLLMItems<T extends { name: string; id: string }>(
17+
llmItems: T[],
18+
databaseItems: T[],
19+
itemType: string,
20+
): T[] {
21+
if (!llmItems || llmItems.length === 0) {
22+
return []
2523
}
2624

27-
let categoriesText = ''
28-
for (const [groupName, names] of groupedCategories) {
29-
categoriesText += `## ${groupName}\n`
30-
for (const name of names) {
31-
categoriesText += `- ${name}\n`
32-
}
33-
categoriesText += '\n'
34-
}
35-
return categoriesText.trim()
25+
// Accepts the 8-4-4-4-12 hexadecimal layout in either letter case.
// It does not check the UUID version or variant bits.
const validUuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
26+
27+
return llmItems
28+
.map((llmItem) => {
29+
// Try to find by ID if UUID is valid
30+
if (validUuidRegex.test(llmItem.id)) {
31+
const dbItem = databaseItems.find((item) => item.id === llmItem.id)
32+
if (dbItem) {
33+
// Return the reference entry values, not the values echoed by the LLM.
return { name: dbItem.name, id: dbItem.id } as T
34+
}
35+
svc.log.warn(`${itemType} UUID "${llmItem.id}" not found in database, trying name lookup`)
36+
} else {
37+
// Optional chaining keeps this log line from throwing when id is missing.
svc.log.warn(
38+
`${itemType} has invalid UUID format: "${llmItem.id}" (length: ${llmItem.id?.length || 0}), trying name lookup`,
39+
)
40+
}
41+
42+
// Fallback: try to find by name
// The lookup is skipped entirely when the LLM item has no name.
43+
const dbItem = llmItem.name
44+
? databaseItems.find((item) => item.name.toLowerCase() === llmItem.name.toLowerCase())
45+
: null
46+
47+
if (dbItem) {
48+
svc.log.info(`Found ${itemType} "${llmItem.name}" by name, using DB UUID "${dbItem.id}"`)
49+
return { name: dbItem.name, id: dbItem.id } as T
50+
}
51+
52+
svc.log.warn(`${itemType} "${llmItem.name}" not found in database, skipping`)
53+
return null
54+
})
55+
// Drop the null placeholders left by items that could not be matched.
.filter(Boolean)
56+
}
57+
58+
/**
 * Serializes the given categories for embedding in the LLM prompt.
 *
 * Each category is reduced to an object with exactly three fields (name, id,
 * categoryGroupName) and the resulting array is rendered as JSON indented by
 * two spaces.
 *
 * @param categories - Categories to list in the prompt.
 * @returns The JSON text of the reduced category array.
 */
function formatTextCategoriesForPrompt(categories: IListedCategory[]): string {
59+
// Copy only the fields the prompt needs; other category fields are left out.
const categoryObjects = categories.map((category) => ({
60+
name: category.name,
61+
id: category.id,
62+
categoryGroupName: category.categoryGroupName,
63+
}))
64+
65+
return JSON.stringify(categoryObjects, null, 2)
3666
}
3767

3868
function formatTextCollectionsForPrompt(
@@ -87,60 +117,75 @@ export async function findCategoriesWithLLM({
87117
})
88118

89119
const prompt = `
120+
You are an expert open-source analyst. Your job is to classify ${github} into appropriate categories.
90121
91-
You are an expert open-source analyst. Your job is to classify ${github} into appropriate categories.
92-
93-
## Context and Purpose
94-
This classification is part of the Open Source Index, a comprehensive catalog of the most critical open-source projects.
95-
Developers and organizations use this index to:
96-
- Discover relevant open-source tools for their technology stack
97-
- Understand the open-source ecosystem in their domain
98-
- Make informed decisions about which projects to adopt or contribute to
99-
- Assess the health and importance of projects in specific technology areas
122+
## Context and Purpose
123+
This classification is part of the Open Source Index, a comprehensive catalog of the most critical open-source projects.
124+
Developers and organizations use this index to:
125+
- Discover relevant open-source tools for their technology stack
126+
- Understand the open-source ecosystem in their domain
127+
- Make informed decisions about which projects to adopt or contribute to
128+
- Assess the health and importance of projects in specific technology areas
100129
101-
Accurate categorization is essential for users to find the right projects when browsing by technology domain or industry vertical.
130+
Accurate categorization is essential for users to find the right projects when browsing by technology domain or industry vertical.
102131
103-
## Project Information
104-
- URL: ${github}
105-
- Description: ${description}
106-
- Topics: ${topics}
107-
- Homepage: ${website}
132+
## Project Information
133+
- URL: ${github}
134+
- Description: ${description}
135+
- Topics: ${topics}
136+
- Homepage: ${website}
108137
109-
## Available Categories
110-
These categories are organized by category groups and each category is shown as "CategoryName-CategoryID":
138+
## Available Categories (AUTHORITATIVE, CLOSED SET)
139+
The following categories are the ONLY valid options.
140+
They are provided as a JSON array of immutable objects.
141+
Every valid category is exactly one object in this array:
111142
112-
${formatTextCategoriesForPrompt(categories)}
143+
${formatTextCategoriesForPrompt(categories)}
113144
114-
## Your Task
115-
Analyze the project and determine which categories it belongs to. A project can belong to multiple categories if appropriate.
145+
## NON-NEGOTIABLE OUTPUT CONSTRAINTS (MUST FOLLOW)
146+
- You MUST select categories ONLY from the JSON array above.
147+
- You MUST NOT invent categories.
148+
- You MUST NOT generate new ids.
149+
- You MUST NOT retype, rephrase, normalize, translate, or modify ANY character
150+
of any selected category object's "name" or "id".
151+
- The output "categories" MUST be a subset of objects copied EXACTLY from the array above.
152+
- If you cannot comply perfectly, return {"categories": []}.
116153
117-
Consider:
118-
- The project's primary functionality and purpose
119-
- The technology domain it operates in
120-
- The industry or vertical it serves (if applicable)
121-
- How developers would expect to find this project when browsing by category
154+
### MANDATORY SELF-CHECK BEFORE FINAL OUTPUT
155+
For each object you plan to output in "categories":
156+
1) Confirm there is an IDENTICAL object in the provided JSON array (same "name" string, same "id" string).
157+
2) If not identical, REMOVE it (do not replace it).
122158
123-
If the project doesn't clearly fit into any of the available categories, return an empty array for categories.
159+
## Your Task
160+
Analyze the project and determine which categories it belongs to.
161+
A project can belong to multiple categories if appropriate.
124162
125-
## Format
126-
Respond with a valid JSON object **only**. Do not include any explanations, markdown formatting, or extra text.
163+
Consider:
164+
- The project's primary functionality and purpose
165+
- The technology domain it operates in
166+
- The industry or vertical it serves (if applicable)
167+
- How developers would expect to find this project when browsing by category
127168
128-
If the project fits one or more categories:
129-
{
130-
"categories": [
131-
{ "name": "CategoryName", "id": "CategoryID" },
132-
{ "name": "AnotherCategory", "id": "AnotherID" }
133-
],
134-
"explanation": "Brief explanation of why you chose these categories"
135-
}
169+
If the project doesn't clearly fit into any of the available categories, return an empty array for categories.
136170
137-
If the project does not clearly fit any category:
138-
{
139-
"categories": []
140-
}
171+
## Format
172+
Respond with a valid JSON object ONLY.
173+
Do not include explanations outside the JSON.
174+
Do not include markdown formatting or extra text.
141175
176+
If the project fits one or more categories:
177+
{
178+
"categories": [
179+
{ "name": "Source Code Management", "id": "9a66d814-22b8-493d-a3a7-fb2d9e93587c" }
180+
],
181+
"explanation": "Brief explanation of why you chose these categories"
182+
}
142183
143-
`
184+
If the project does not clearly fit any category OR if any mismatch risk exists:
185+
{
186+
"categories": []
187+
}
188+
`
144189

145190
const llmService = new LlmService(
146191
qx,
@@ -156,7 +201,22 @@ export async function findCategoriesWithLLM({
156201
explanation: string
157202
}>(prompt)
158203

204+
// Check if result is null (LLM disabled or error)
205+
if (!result) {
206+
svc.log.warn('LLM service returned null result, skipping categorization')
207+
return { categories: [], explanation: 'LLM service unavailable' }
208+
}
209+
210+
// Validate and correct UUIDs from LLM response
211+
if (Array.isArray(result.categories) && result.categories.length > 0) {
212+
result.categories = validateAndCorrectLLMItems(result.categories, categories, 'Category')
213+
} else if (result.categories && !Array.isArray(result.categories)) {
214+
svc.log.error(`LLM returned categories as non-array: ${typeof result.categories}`)
215+
result.categories = []
216+
}
217+
159218
svc.log.info(`categories found: ${JSON.stringify(result)}`)
219+
160220
return result
161221
}
162222

@@ -261,6 +321,30 @@ export async function findCollectionsWithLLM({
261321
explanation: string
262322
}>(prompt)
263323

324+
// Check if result is null (LLM disabled or error)
325+
if (!result) {
326+
svc.log.warn('LLM service returned null result, skipping collection classification')
327+
return { collections: [], explanation: 'LLM service unavailable' }
328+
}
329+
330+
// Validate and correct UUIDs from LLM response
331+
if (Array.isArray(result.collections) && result.collections.length > 0) {
332+
const validatedCollections = validateAndCorrectLLMItems(
333+
result.collections,
334+
collections,
335+
'Collection',
336+
)
337+
result.collections = validatedCollections
338+
339+
// Log the validated collection IDs for debugging
340+
svc.log.info(
341+
`Validated collections: ${validatedCollections.map((c) => `${c.name}:${c.id}`).join(', ')}`,
342+
)
343+
} else if (result.collections && !Array.isArray(result.collections)) {
344+
svc.log.error(`LLM returned collections as non-array: ${typeof result.collections}`)
345+
result.collections = []
346+
}
347+
264348
svc.log.info(`collections found: ${JSON.stringify(result)}`)
265349

266350
return result
@@ -283,6 +367,11 @@ export async function connectProjectAndCollection(
283367
collectionIds: string[],
284368
insightsProjectId: string,
285369
) {
370+
if (collectionIds.length === 0) {
371+
svc.log.warn(`No collection IDs to connect for project ${insightsProjectId}, skipping`)
372+
return
373+
}
374+
286375
svc.log.info(`updating the collections: ${collectionIds} with the project: ${insightsProjectId}`)
287376
await connectProjectsAndCollections(
288377
dbStoreQx(svc.postgres.writer),

0 commit comments

Comments
 (0)