@@ -13,26 +13,56 @@ import {
1313import { svc } from '../main'
1414import { IFindCategoryParams , IFindCollectionsParams , IListedCategory } from '../types'
1515
/**
 * Reconciles items returned by the LLM against the authoritative database list.
 *
 * Each LLM item is resolved in two steps:
 *  1. by `id`, when the id is a well-formed UUID that exists in `databaseItems`;
 *  2. otherwise by case-insensitive `name`.
 * Items that cannot be resolved either way are dropped. Resolved items always
 * carry the database's own `name` and `id`, never the LLM's spelling.
 *
 * The LLM output is untrusted, so malformed entries (null entries, non-string
 * `id` / `name`) are logged and skipped instead of throwing.
 *
 * @param llmItems - Items as returned by the LLM (may be empty or malformed).
 * @param databaseItems - The closed set of valid items.
 * @param itemType - Human-readable label used in log messages (e.g. "Category").
 * @returns The resolved items as `{ name, id }` objects, in the LLM's order.
 */
function validateAndCorrectLLMItems<T extends { name: string; id: string }>(
  llmItems: T[],
  databaseItems: T[],
  itemType: string,
): T[] {
  if (!Array.isArray(llmItems) || llmItems.length === 0) {
    return []
  }

  const validUuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i

  // Lookup key: trimmed + lower-cased. Non-string values map to '' (never matched).
  const normalize = (value: unknown): string =>
    typeof value === 'string' ? value.trim().toLowerCase() : ''

  // Index the database once. The first occurrence wins, matching the
  // first-match semantics of a linear `find`.
  const dbById = new Map<string, T>()
  const dbByName = new Map<string, T>()
  for (const item of databaseItems ?? []) {
    if (!item) {
      continue
    }
    const idKey = normalize(item.id)
    if (idKey && !dbById.has(idKey)) {
      dbById.set(idKey, item)
    }
    const nameKey = normalize(item.name)
    if (nameKey && !dbByName.has(nameKey)) {
      dbByName.set(nameKey, item)
    }
  }

  const resolved: T[] = []

  for (const llmItem of llmItems) {
    if (llmItem == null || typeof llmItem !== 'object') {
      svc.log.warn(`${itemType} entry returned by LLM is not an object, skipping`)
      continue
    }

    const rawId = llmItem.id
    const idKey = normalize(rawId)

    // Try to find by ID if the UUID is well-formed. The comparison is
    // case-insensitive, consistent with the regex's `i` flag.
    if (validUuidRegex.test(idKey)) {
      const matchById = dbById.get(idKey)
      if (matchById) {
        // Only name/id are propagated, so the cast is a deliberate narrowing of T.
        resolved.push({ name: matchById.name, id: matchById.id } as T)
        continue
      }
      svc.log.warn(`${itemType} UUID "${rawId}" not found in database, trying name lookup`)
    } else {
      const idLength = typeof rawId === 'string' ? rawId.length : 0
      svc.log.warn(
        `${itemType} has invalid UUID format: "${rawId}" (length: ${idLength}), trying name lookup`,
      )
    }

    // Fallback: try to find by name.
    const nameKey = normalize(llmItem.name)
    const matchByName = nameKey ? dbByName.get(nameKey) : undefined

    if (matchByName) {
      svc.log.info(
        `Found ${itemType} "${llmItem.name}" by name, using DB UUID "${matchByName.id}"`,
      )
      resolved.push({ name: matchByName.name, id: matchByName.id } as T)
      continue
    }

    svc.log.warn(`${itemType} "${llmItem.name}" not found in database, skipping`)
  }

  return resolved
}
57+
/**
 * Serializes the available categories for embedding in the LLM prompt.
 *
 * Each category is reduced to its `name`, `id` and `categoryGroupName`
 * (in that key order) and the list is rendered as pretty-printed JSON
 * with two-space indentation.
 *
 * @param categories - Categories to expose to the LLM.
 * @returns A JSON array string, one object per category.
 */
function formatTextCategoriesForPrompt(categories: IListedCategory[]): string {
  // Project each category onto the three fields the prompt exposes.
  const toPromptEntry = ({ name, id, categoryGroupName }: IListedCategory) => ({
    name,
    id,
    categoryGroupName,
  })

  const promptEntries = categories.map(toPromptEntry)
  return JSON.stringify(promptEntries, null, 2)
}
3767
3868function formatTextCollectionsForPrompt (
@@ -87,60 +117,75 @@ export async function findCategoriesWithLLM({
87117 } )
88118
89119 const prompt = `
120+ You are an expert open-source analyst. Your job is to classify ${ github } into appropriate categories.
90121
91- You are an expert open-source analyst. Your job is to classify ${ github } into appropriate categories.
92-
93- ## Context and Purpose
94- This classification is part of the Open Source Index, a comprehensive catalog of the most critical open-source projects.
95- Developers and organizations use this index to:
96- - Discover relevant open-source tools for their technology stack
97- - Understand the open-source ecosystem in their domain
98- - Make informed decisions about which projects to adopt or contribute to
99- - Assess the health and importance of projects in specific technology areas
122+ ## Context and Purpose
123+ This classification is part of the Open Source Index, a comprehensive catalog of the most critical open-source projects.
124+ Developers and organizations use this index to:
125+ - Discover relevant open-source tools for their technology stack
126+ - Understand the open-source ecosystem in their domain
127+ - Make informed decisions about which projects to adopt or contribute to
128+ - Assess the health and importance of projects in specific technology areas
100129
101- Accurate categorization is essential for users to find the right projects when browsing by technology domain or industry vertical.
130+ Accurate categorization is essential for users to find the right projects when browsing by technology domain or industry vertical.
102131
103- ## Project Information
104- - URL: ${ github }
105- - Description: ${ description }
106- - Topics: ${ topics }
107- - Homepage: ${ website }
132+ ## Project Information
133+ - URL: ${ github }
134+ - Description: ${ description }
135+ - Topics: ${ topics }
136+ - Homepage: ${ website }
108137
109- ## Available Categories
110- These categories are organized by category groups and each category is shown as "CategoryName-CategoryID":
138+ ## Available Categories (AUTHORITATIVE, CLOSED SET)
139+ The following categories are the ONLY valid options.
140+ They are provided as a JSON array of immutable objects.
141+ Every valid category is exactly one object in this array:
111142
112- ${ formatTextCategoriesForPrompt ( categories ) }
143+ ${ formatTextCategoriesForPrompt ( categories ) }
113144
114- ## Your Task
115- Analyze the project and determine which categories it belongs to. A project can belong to multiple categories if appropriate.
145+ ## NON-NEGOTIABLE OUTPUT CONSTRAINTS (MUST FOLLOW)
146+ - You MUST select categories ONLY from the JSON array above.
147+ - You MUST NOT invent categories.
148+ - You MUST NOT generate new ids.
149+ - You MUST NOT retype, rephrase, normalize, translate, or modify ANY character
150+ of any selected category object's "name" or "id".
151+ - The output "categories" MUST be a subset of objects copied EXACTLY from the array above.
152+ - If you cannot comply perfectly, return {"categories": []}.
116153
117- Consider:
118- - The project's primary functionality and purpose
119- - The technology domain it operates in
120- - The industry or vertical it serves (if applicable)
121- - How developers would expect to find this project when browsing by category
154+ ### MANDATORY SELF-CHECK BEFORE FINAL OUTPUT
155+ For each object you plan to output in "categories":
156+ 1) Confirm there is an IDENTICAL object in the provided JSON array (same "name" string, same "id" string).
157+ 2) If not identical, REMOVE it (do not replace it).
122158
123- If the project doesn't clearly fit into any of the available categories, return an empty array for categories.
159+ ## Your Task
160+ Analyze the project and determine which categories it belongs to.
161+ A project can belong to multiple categories if appropriate.
124162
125- ## Format
126- Respond with a valid JSON object **only**. Do not include any explanations, markdown formatting, or extra text.
163+ Consider:
164+ - The project's primary functionality and purpose
165+ - The technology domain it operates in
166+ - The industry or vertical it serves (if applicable)
167+ - How developers would expect to find this project when browsing by category
127168
128- If the project fits one or more categories:
129- {
130- "categories": [
131- { "name": "CategoryName", "id": "CategoryID" },
132- { "name": "AnotherCategory", "id": "AnotherID" }
133- ],
134- "explanation": "Brief explanation of why you chose these categories"
135- }
169+ If the project doesn't clearly fit into any of the available categories, return an empty array for categories.
136170
137- If the project does not clearly fit any category:
138- {
139- "categories": []
140- }
171+ ## Format
172+ Respond with a valid JSON object ONLY.
173+ Do not include explanations outside the JSON.
174+ Do not include markdown formatting or extra text.
141175
176+ If the project fits one or more categories:
177+ {
178+ "categories": [
179+ { "name": "Source Code Management", "id": "9a66d814-22b8-493d-a3a7-fb2d9e93587c" }
180+ ],
181+ "explanation": "Brief explanation of why you chose these categories"
182+ }
142183
143- `
184+ If the project does not clearly fit any category OR if any mismatch risk exists:
185+ {
186+ "categories": []
187+ }
188+ `
144189
145190 const llmService = new LlmService (
146191 qx ,
@@ -156,7 +201,22 @@ export async function findCategoriesWithLLM({
156201 explanation : string
157202 } > ( prompt )
158203
204+ // Check if result is null (LLM disabled or error)
205+ if ( ! result ) {
206+ svc . log . warn ( 'LLM service returned null result, skipping categorization' )
207+ return { categories : [ ] , explanation : 'LLM service unavailable' }
208+ }
209+
210+ // Validate and correct UUIDs from LLM response
211+ if ( Array . isArray ( result . categories ) && result . categories . length > 0 ) {
212+ result . categories = validateAndCorrectLLMItems ( result . categories , categories , 'Category' )
213+ } else if ( result . categories && ! Array . isArray ( result . categories ) ) {
214+ svc . log . error ( `LLM returned categories as non-array: ${ typeof result . categories } ` )
215+ result . categories = [ ]
216+ }
217+
159218 svc . log . info ( `categories found: ${ JSON . stringify ( result ) } ` )
219+
160220 return result
161221}
162222
@@ -261,6 +321,30 @@ export async function findCollectionsWithLLM({
261321 explanation : string
262322 } > ( prompt )
263323
324+ // Check if result is null (LLM disabled or error)
325+ if ( ! result ) {
326+ svc . log . warn ( 'LLM service returned null result, skipping collection classification' )
327+ return { collections : [ ] , explanation : 'LLM service unavailable' }
328+ }
329+
330+ // Validate and correct UUIDs from LLM response
331+ if ( Array . isArray ( result . collections ) && result . collections . length > 0 ) {
332+ const validatedCollections = validateAndCorrectLLMItems (
333+ result . collections ,
334+ collections ,
335+ 'Collection' ,
336+ )
337+ result . collections = validatedCollections
338+
339+ // Log the validated collection IDs for debugging
340+ svc . log . info (
341+ `Validated collections: ${ validatedCollections . map ( ( c ) => `${ c . name } :${ c . id } ` ) . join ( ', ' ) } ` ,
342+ )
343+ } else if ( result . collections && ! Array . isArray ( result . collections ) ) {
344+ svc . log . error ( `LLM returned collections as non-array: ${ typeof result . collections } ` )
345+ result . collections = [ ]
346+ }
347+
264348 svc . log . info ( `collections found: ${ JSON . stringify ( result ) } ` )
265349
266350 return result
@@ -283,6 +367,11 @@ export async function connectProjectAndCollection(
283367 collectionIds : string [ ] ,
284368 insightsProjectId : string ,
285369) {
370+ if ( collectionIds . length === 0 ) {
371+ svc . log . warn ( `No collection IDs to connect for project ${ insightsProjectId } , skipping` )
372+ return
373+ }
374+
286375 svc . log . info ( `updating the collections: ${ collectionIds } with the project: ${ insightsProjectId } ` )
287376 await connectProjectsAndCollections (
288377 dbStoreQx ( svc . postgres . writer ) ,
0 commit comments