@@ -207,29 +207,275 @@ export function compressSemanticContent(comment: string, keywords: string[]): st
207207}
208208
/**
 * Condense a comment into a short keyword phrase, favouring meaning over raw compression.
 *
 * A comment that already fits within `maxLength` is returned untouched. Otherwise it is
 * run through five stages in order: term extraction, semantic-core selection, context
 * enhancement, selective abbreviation, and final validation.
 *
 * @param comment - Comment text to condense.
 * @param maxLength - Length budget, in characters, that triggers and guides condensing.
 * @param context - Optional caller-supplied context, forwarded to the context-enhancement stage.
 * @returns The original comment when it already fits, otherwise the finalized phrase.
 */
export function selectBestKeywords(comment: string, maxLength: number, context?: CommentContext): string {
  if (comment.length > maxLength) {
    // Stage 1: classify recognised words by semantic role.
    const classified = extractMeaningfulTerms(comment);

    // Stage 2: keep the action + object core.
    const core = preserveSemanticCore(classified);

    // Stage 3: append context terms that add information.
    const withContext = enhanceWithContext(core, comment, context);

    // Stage 4: abbreviate only when the phrase is still too long.
    const compacted = applySelectiveAbbreviations(withContext, maxLength);

    // Stage 5: enforce the length budget and sanity-check the phrase.
    return validateAndFinalize(compacted, comment, maxLength);
  }

  return comment;
}
232+
/**
 * Extract recognised terms from a comment and classify each by semantic role.
 *
 * The comment is lower-cased, every character that is not a word character, whitespace
 * or hyphen is replaced by a space, and the text is split on whitespace. Each distinct
 * word is then placed in at most one bucket, checked in priority order:
 * action verb, object noun, descriptor, domain term. Words matching none are dropped.
 *
 * Each word is classified only once (first occurrence wins), so a comment that repeats
 * a word does not produce duplicate entries in a bucket.
 *
 * @param comment - Free-form comment text.
 * @returns Buckets of recognised words, each in order of first appearance in the comment.
 */
function extractMeaningfulTerms(comment: string): { actions: string[], objects: string[], descriptors: string[], domains: string[] } {
  const words = comment.toLowerCase()
    .replace(/[^\w\s-]/g, ' ')
    .split(/\s+/)
    .filter(word => word.length > 0);

  const meaningfulTerms = {
    actions: [] as string[],
    objects: [] as string[],
    descriptors: [] as string[],
    domains: [] as string[]
  };

  // Action verbs (highest priority)
  const actionVerbs = new Set([
    'validates', 'manages', 'processes', 'handles', 'creates', 'generates', 'executes',
    'retrieves', 'stores', 'updates', 'deletes', 'checks', 'verifies', 'authenticates',
    'authorizes', 'encrypts', 'decrypts', 'compresses', 'decompresses', 'parses',
    'formats', 'transforms', 'converts', 'filters', 'sorts', 'searches', 'finds',
    'loads', 'saves', 'sends', 'receives', 'connects', 'disconnects', 'initializes',
    'configures', 'optimizes', 'caches', 'invalidates', 'refreshes', 'synchronizes'
  ]);

  // Specific objects (high priority)
  const objectNouns = new Set([
    'user', 'users', 'credentials', 'password', 'token', 'tokens', 'session', 'sessions',
    'data', 'record', 'records', 'file', 'files', 'query', 'queries', 'request', 'requests',
    'response', 'responses', 'connection', 'connections', 'configuration', 'config',
    'settings', 'options', 'parameters', 'metadata', 'schema', 'table', 'database',
    'cache', 'memory', 'storage', 'repository', 'service', 'api', 'endpoint', 'route'
  ]);

  // Technical descriptors (medium priority)
  const descriptors = new Set([
    'secure', 'encrypted', 'cached', 'optimized', 'validated', 'authenticated',
    'authorized', 'compressed', 'formatted', 'parsed', 'filtered', 'sorted',
    'synchronized', 'asynchronous', 'concurrent', 'parallel', 'distributed',
    'scalable', 'reliable', 'efficient', 'fast', 'slow', 'large', 'small'
  ]);

  // Domain terms (lowest priority - context only).
  // 'database' and 'api' also appear in objectNouns, which is checked first,
  // so those two words are always classified as objects.
  const domainTerms = new Set([
    'auth', 'authentication', 'database', 'db', 'sql', 'api', 'http', 'rest',
    'graphql', 'json', 'xml', 'html', 'css', 'javascript', 'typescript',
    'python', 'java', 'security', 'encryption', 'validation'
  ]);

  // Classify each distinct word by semantic role; repeats are skipped so a
  // bucket never contains the same term twice.
  const seen = new Set<string>();
  for (const word of words) {
    if (seen.has(word)) continue;
    seen.add(word);

    if (actionVerbs.has(word)) {
      meaningfulTerms.actions.push(word);
    } else if (objectNouns.has(word)) {
      meaningfulTerms.objects.push(word);
    } else if (descriptors.has(word)) {
      meaningfulTerms.descriptors.push(word);
    } else if (domainTerms.has(word)) {
      meaningfulTerms.domains.push(word);
    }
  }

  return meaningfulTerms;
}
298+
/**
 * Build the semantic core of a comment from its classified terms.
 *
 * The core is, in order: the first action verb (when any exists), then at most two
 * objects, then - only when there is no action verb at all - the first descriptor.
 *
 * @param terms - Classified term buckets; `domains` is not consulted here.
 * @returns A new array holding the core terms (possibly empty).
 */
function preserveSemanticCore(terms: { actions: string[], objects: string[], descriptors: string[], domains: string[] }): string[] {
  const { actions, objects, descriptors } = terms;
  const primaryObjects = objects.slice(0, 2);

  // With an action verb present the core is simply "verb + objects".
  if (actions.length > 0) {
    return [actions[0], ...primaryObjects];
  }

  // Without a verb, a descriptor (if any) trails the objects.
  return descriptors.length > 0
    ? [...primaryObjects, descriptors[0]]
    : primaryObjects;
}
322+
/**
 * Extend the semantic core with abbreviated domain-context terms.
 *
 * For every term reported by `detectDomainContext(comment)`, the term is skipped when it
 * overlaps (substring in either direction) with a term already collected, or when four
 * terms have already been collected. Otherwise its abbreviation is appended - but only
 * if an abbreviation exists and differs from the term itself.
 *
 * @param core - Core terms; not mutated.
 * @param comment - Original comment, used for domain detection.
 * @param context - Accepted for interface compatibility; not read by this function.
 * @returns A new array containing the core followed by any added context abbreviations.
 */
function enhanceWithContext(core: string[], comment: string, context?: CommentContext): string[] {
  const collected = core.slice();

  for (const domainTerm of detectDomainContext(comment)) {
    const overlapsExisting = collected.some(existing =>
      existing.includes(domainTerm) || domainTerm.includes(existing)
    );

    // Skip redundant terms, and stop growing once four terms are held.
    if (overlapsExisting || collected.length >= 4) {
      continue;
    }

    // Only the abbreviated form is ever added; terms without one are ignored.
    const shortForm = getContextAbbreviation(domainTerm);
    if (shortForm && shortForm !== domainTerm) {
      collected.push(shortForm);
    }
  }

  return collected;
}
349+
/**
 * Abbreviate terms, but only when the space-joined phrase exceeds the length budget.
 *
 * @param terms - Terms to fit.
 * @param maxLength - Maximum length of the space-joined phrase, in characters.
 * @returns The same `terms` array when it already fits; otherwise a new array in which
 *          each term is replaced by its selective abbreviation where one exists.
 */
function applySelectiveAbbreviations(terms: string[], maxLength: number): string[] {
  const alreadyFits = terms.join(' ').length <= maxLength;
  if (alreadyFits) {
    return terms; // nothing to shorten
  }

  return terms.map(term => {
    // Terms with no known abbreviation pass through unchanged.
    const shortForm = getSelectiveAbbreviation(term);
    return shortForm ? shortForm : term;
  });
}
368+
/**
 * Look up the short form of a domain-context term.
 *
 * The lookup uses a Map rather than a plain object so that words which happen to name
 * inherited `Object.prototype` members (e.g. "constructor", "toString") are reported as
 * having no abbreviation instead of leaking a non-string value.
 *
 * @param contextTerm - Term to abbreviate (matched exactly, case-sensitive).
 * @returns The abbreviation, or `null` when the term has none.
 */
function getContextAbbreviation(contextTerm: string): string | null {
  const abbreviations = new Map<string, string>([
    ['authentication', 'auth'],
    ['database', 'db'],
    ['configuration', 'config'],
    ['repository', 'repo'],
    ['application', 'app']
  ]);

  return abbreviations.get(contextTerm) ?? null;
}
383+
/**
 * Look up an abbreviation for a long term.
 *
 * Terms of 8 characters or fewer are never abbreviated. Longer terms are matched exactly
 * against a fixed table. A Map is used for the table so that long words naming inherited
 * `Object.prototype` members (e.g. "constructor", "hasOwnProperty") yield `null` rather
 * than a non-string value.
 *
 * @param term - Term to abbreviate (matched exactly, case-sensitive).
 * @returns The abbreviation, or `null` when the term is short or has no table entry.
 */
function getSelectiveAbbreviation(term: string): string | null {
  // Only abbreviate terms longer than 8 characters
  if (term.length <= 8) return null;

  const abbreviations = new Map<string, string>([
    ['authentication', 'auth'],
    ['configuration', 'config'],
    ['repository', 'repo'],
    ['application', 'app'],
    ['management', 'mgmt'],
    ['processing', 'proc'],
    ['generation', 'gen'],
    ['initialization', 'init'],
    ['validation', 'valid']
  ]);

  return abbreviations.get(term) ?? null;
}
405+
/**
 * Turn the selected terms into the final phrase, enforcing the length budget.
 *
 * - If the joined terms are too long, trailing terms are dropped via `reduceToFit`.
 *   Because `reduceToFit` always keeps at least one term, a single over-long term can
 *   still exceed the budget; in that case the original comment is truncated instead.
 * - If the joined terms fit but fail the semantic-meaning check, the original comment
 *   is truncated instead.
 * - Otherwise the joined terms are returned.
 *
 * @param terms - Selected terms, most important first.
 * @param originalComment - The comment the terms were derived from (fallback source).
 * @param maxLength - Maximum length of the result, in characters.
 * @returns The finalized phrase.
 */
function validateAndFinalize(terms: string[], originalComment: string, maxLength: number): string {
  const result = terms.join(' ');

  // Check if result fits within length limit
  if (result.length > maxLength) {
    // Try removing least important terms
    const reduced = reduceToFit(terms, maxLength);

    // A lone term longer than the budget survives reduction; fall back rather than
    // hand back a phrase that breaks the limit.
    return reduced.length <= maxLength
      ? reduced
      : intelligentTruncation(originalComment, maxLength);
  }

  // Validate semantic quality
  if (!hasSemanticMeaning(result, originalComment)) {
    // Fallback to intelligent truncation
    return intelligentTruncation(originalComment, maxLength);
  }

  return result;
}
427+
/**
 * Drop trailing terms until the space-joined phrase fits within `maxLength`.
 *
 * Terms are assumed to be ordered most-important first, so removal happens from the end.
 * At least one term is always kept; consequently the result can still be longer than
 * `maxLength` when the first term alone exceeds it. The input array is not modified.
 *
 * @param terms - Candidate terms, most important first.
 * @param maxLength - Target maximum length of the joined phrase, in characters.
 * @returns The remaining terms joined by single spaces (empty string for no terms).
 */
function reduceToFit(terms: string[], maxLength: number): string {
  let keepCount = terms.length;

  // Shrink the kept prefix one term at a time, never below a single term.
  while (keepCount > 1 && terms.slice(0, keepCount).join(' ').length > maxLength) {
    keepCount -= 1;
  }

  return terms.slice(0, keepCount).join(' ');
}
442+
/**
 * Decide whether a condensed phrase still carries meaning from the original comment.
 *
 * Two conditions must hold:
 *  1. The phrase contains at least two words longer than two characters.
 *  2. At least one of those words overlaps with a word of the original (lower-cased,
 *     whitespace-split): either the original word contains the phrase word, or the
 *     phrase word contains an original word that is itself longer than two characters.
 *
 * Empty tokens (produced by leading/trailing whitespace in the original) and very short
 * original words such as "a" or "is" are excluded from the reverse containment test,
 * because `String.prototype.includes` would otherwise match them inside almost any word
 * and make the check pass trivially.
 *
 * @param result - Candidate condensed phrase (space-separated terms).
 * @param original - The original comment text.
 * @returns `true` when the phrase passes both conditions.
 */
function hasSemanticMeaning(result: string, original: string): boolean {
  // Must have at least 2 meaningful words
  const words = result.split(' ').filter(w => w.length > 2);
  if (words.length < 2) return false;

  // Should contain at least one action or object from original
  const originalWords = original.toLowerCase().split(/\s+/).filter(w => w.length > 0);

  return words.some(word =>
    originalWords.some(orig =>
      orig.includes(word) || (orig.length > 2 && word.includes(orig))
    )
  );
}
459+
/**
 * Truncate text to at most `maxLength` characters, preferring to end on a whole word.
 *
 * - Text that already fits is returned unchanged.
 * - If the cut lands exactly at the end of a word (the next character is a space),
 *   the cut text is returned as-is so that a complete, fitting word is not discarded.
 * - Otherwise, if the last space in the cut text lies beyond 70% of `maxLength`, the
 *   partial trailing word is removed by cutting back to that space.
 * - Otherwise (backing up would lose too much), the hard cut is returned.
 *
 * @param text - Text to shorten.
 * @param maxLength - Maximum length of the result, in characters.
 * @returns The truncated text.
 */
function intelligentTruncation(text: string, maxLength: number): string {
  if (text.length <= maxLength) return text;

  const truncated = text.substring(0, maxLength);

  // The cut fell on a word boundary: every word in `truncated` is already complete.
  if (text.charAt(maxLength) === ' ' && !truncated.endsWith(' ')) {
    return truncated;
  }

  // Find last complete word that fits
  const lastSpace = truncated.lastIndexOf(' ');

  if (lastSpace > maxLength * 0.7) {
    return truncated.substring(0, lastSpace);
  }

  return truncated;
}
230476
231477/**
232- * Prioritize keywords based on detected context domains
478+ * Prioritize keywords based on detected context domains (LEGACY - keeping for compatibility)
233479 */
234480function prioritizeKeywordsByContext ( keywords : string [ ] , domains : string [ ] ) : string [ ] {
235481 const priorityMap : Record < string , string [ ] > = {
0 commit comments