Skip to content

Commit f540928

Browse files
authored
Merge pull request #30 from freshtechbro/fix/semantic-extraction-meaning-preservation
fix(semantic-extraction): redesign algorithm to preserve meaning over compression
2 parents 3e63c3b + ee57156 commit f540928

2 files changed

Lines changed: 318 additions & 13 deletions

File tree

src/tools/code-map-generator/__tests__/commentProcessor.test.ts

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,12 +174,71 @@ describe('CommentProcessor', () => {
174174

175175
expect(result.length).toBeLessThanOrEqual(25);
176176
// Should preserve meaningful keywords without truncation
177-
expect(result.toLowerCase()).toMatch(/auth|user|validation|database/);
177+
expect(result.toLowerCase()).toMatch(/processes|validates|user|credentials/);
178178
// Should NOT contain truncation indicator
179179
expect(result).not.toContain('...');
180180
// Should be composed of meaningful words, not truncated text
181181
expect(result.split(' ').every(word => word.length > 0)).toBe(true);
182182
});
183+
184+
it('should preserve action verbs in semantic extraction', () => {
185+
const comment = 'Validates user credentials against the database';
186+
const result = processor.processComment(comment);
187+
188+
expect(result.length).toBeLessThanOrEqual(25);
189+
expect(result.toLowerCase()).toContain('validates');
190+
expect(result.toLowerCase()).toMatch(/user|credentials/);
191+
});
192+
193+
it('should handle authentication context meaningfully', () => {
194+
const comment = 'Manages user sessions and token validation processes';
195+
const result = processor.processComment(comment);
196+
197+
expect(result.length).toBeLessThanOrEqual(25);
198+
expect(result.toLowerCase()).toContain('manages');
199+
expect(result.toLowerCase()).toMatch(/user|sessions|token/);
200+
expect(result).not.toContain('...');
201+
});
202+
203+
it('should handle database context meaningfully', () => {
204+
const comment = 'Executes SQL queries to retrieve user records from database';
205+
const result = processor.processComment(comment);
206+
207+
expect(result.length).toBeLessThanOrEqual(25);
208+
expect(result.toLowerCase()).toContain('executes');
209+
expect(result.toLowerCase()).toMatch(/sql|queries|user|records/);
210+
expect(result).not.toContain('...');
211+
});
212+
213+
it('should preserve semantic meaning over compression', () => {
214+
const comment = 'Stores encrypted user data in secure database storage';
215+
const result = processor.processComment(comment);
216+
217+
expect(result.length).toBeLessThanOrEqual(25);
218+
expect(result.toLowerCase()).toContain('stores');
219+
expect(result.toLowerCase()).toMatch(/encrypted|user|data/);
220+
// Should not become generic like "auth config"
221+
expect(result.toLowerCase()).not.toMatch(/^(auth|db|api)\s+(config|integration)$/);
222+
});
223+
224+
it('should handle file operations meaningfully', () => {
225+
const comment = 'Handles file upload operations with comprehensive validation and security checks';
226+
const result = processor.processComment(comment);
227+
228+
expect(result.length).toBeLessThanOrEqual(25);
229+
expect(result.toLowerCase()).toContain('handles');
230+
expect(result.toLowerCase()).toMatch(/file|upload/);
231+
expect(result).not.toContain('...');
232+
});
233+
234+
it('should fallback gracefully for unclear comments', () => {
235+
const comment = 'This is a very generic comment without specific technical terms or actions';
236+
const result = processor.processComment(comment);
237+
238+
expect(result.length).toBeLessThanOrEqual(25);
239+
expect(result.length).toBeGreaterThan(0);
240+
expect(result).not.toContain('...');
241+
});
183242
});
184243

185244
describe('processComments', () => {

src/tools/code-map-generator/utils/semanticExtractor.ts

Lines changed: 258 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -207,29 +207,275 @@ export function compressSemanticContent(comment: string, keywords: string[]): st
207207
}
208208

209209
/**
 * Shorten a comment to at most `maxLength` characters while keeping its
 * meaning, rather than compressing it as hard as possible.
 *
 * A comment that already fits is returned unchanged. Otherwise the comment is
 * run through a fixed pipeline: classify its terms, keep the action/object
 * core, add domain context, abbreviate only if the result is too long, and
 * finally validate the outcome (falling back to truncation when it is poor).
 *
 * @param comment - The raw comment text.
 * @param maxLength - Maximum number of characters allowed in the result.
 * @param context - Optional comment context, forwarded to the context-enhancement step.
 * @returns The condensed comment text.
 */
export function selectBestKeywords(comment: string, maxLength: number, context?: CommentContext): string {
  if (comment.length > maxLength) {
    const classifiedTerms = extractMeaningfulTerms(comment);
    const coreTerms = preserveSemanticCore(classifiedTerms);
    const termsWithContext = enhanceWithContext(coreTerms, comment, context);
    const fittedTerms = applySelectiveAbbreviations(termsWithContext, maxLength);
    return validateAndFinalize(fittedTerms, comment, maxLength);
  }

  // Nothing to do: the comment already fits.
  return comment;
}
232+
233+
/**
 * Split a comment into words and classify each known word by semantic role.
 *
 * The comment is lower-cased and split on every run of non-alphanumeric
 * characters. Hyphens and underscores are separators too, so compounds such
 * as `user-credentials` or `user_id` contribute their component words instead
 * of forming a single token that matches nothing.
 *
 * Each word is assigned to the first matching category, checked in the order
 * actions, objects, descriptors, domains. Words that match no category are
 * dropped. Matches are reported in the order they appear and are not
 * de-duplicated.
 *
 * @param comment - The raw comment text.
 * @returns The matched words grouped by role.
 */
function extractMeaningfulTerms(comment: string): { actions: string[], objects: string[], descriptors: string[], domains: string[] } {
  // Action verbs (highest priority)
  const actionVerbs = new Set<string>([
    'validates', 'manages', 'processes', 'handles', 'creates', 'generates', 'executes',
    'retrieves', 'stores', 'updates', 'deletes', 'checks', 'verifies', 'authenticates',
    'authorizes', 'encrypts', 'decrypts', 'compresses', 'decompresses', 'parses',
    'formats', 'transforms', 'converts', 'filters', 'sorts', 'searches', 'finds',
    'loads', 'saves', 'sends', 'receives', 'connects', 'disconnects', 'initializes',
    'configures', 'optimizes', 'caches', 'invalidates', 'refreshes', 'synchronizes'
  ]);

  // Specific objects (high priority)
  const objectNouns = new Set<string>([
    'user', 'users', 'credentials', 'password', 'token', 'tokens', 'session', 'sessions',
    'data', 'record', 'records', 'file', 'files', 'query', 'queries', 'request', 'requests',
    'response', 'responses', 'connection', 'connections', 'configuration', 'config',
    'settings', 'options', 'parameters', 'metadata', 'schema', 'table', 'database',
    'cache', 'memory', 'storage', 'repository', 'service', 'api', 'endpoint', 'route'
  ]);

  // Technical descriptors (medium priority)
  const descriptors = new Set<string>([
    'secure', 'encrypted', 'cached', 'optimized', 'validated', 'authenticated',
    'authorized', 'compressed', 'formatted', 'parsed', 'filtered', 'sorted',
    'synchronized', 'asynchronous', 'concurrent', 'parallel', 'distributed',
    'scalable', 'reliable', 'efficient', 'fast', 'slow', 'large', 'small'
  ]);

  // Domain terms (lowest priority - context only).
  // 'api' and 'database' also appear in objectNouns; because objects are
  // checked first, those two words are always classified as objects.
  const domainTerms = new Set<string>([
    'auth', 'authentication', 'database', 'db', 'sql', 'api', 'http', 'rest',
    'graphql', 'json', 'xml', 'html', 'css', 'javascript', 'typescript',
    'python', 'java', 'security', 'encryption', 'validation'
  ]);

  const meaningfulTerms: { actions: string[], objects: string[], descriptors: string[], domains: string[] } = {
    actions: [],
    objects: [],
    descriptors: [],
    domains: []
  };

  // \W covers whitespace, punctuation and '-'; '_' is added explicitly
  // because it counts as a word character.
  const words = comment
    .toLowerCase()
    .split(/[\W_]+/)
    .filter(word => word.length > 0);

  // Classify words by semantic role
  for (const word of words) {
    if (actionVerbs.has(word)) {
      meaningfulTerms.actions.push(word);
    } else if (objectNouns.has(word)) {
      meaningfulTerms.objects.push(word);
    } else if (descriptors.has(word)) {
      meaningfulTerms.descriptors.push(word);
    } else if (domainTerms.has(word)) {
      meaningfulTerms.domains.push(word);
    }
  }

  return meaningfulTerms;
}
298+
299+
/**
 * Build the semantic core of a comment: its leading action plus its primary
 * objects.
 *
 * The result contains, in this order:
 *  1. the first action verb, if any;
 *  2. up to two distinct objects, in first-seen order (duplicates are removed
 *     first so the same word is never emitted twice);
 *  3. the first descriptor, but only when there is no action verb.
 *
 * `terms.domains` is not used here.
 *
 * @param terms - Words grouped by semantic role.
 * @returns The ordered core terms; empty when nothing qualifies.
 */
function preserveSemanticCore(terms: { actions: string[], objects: string[], descriptors: string[], domains: string[] }): string[] {
  const core: string[] = [];
  const hasAction = terms.actions.length > 0;

  // Always include the first action verb (most important)
  if (hasAction) {
    core.push(terms.actions[0]);
  }

  // Include primary objects (up to 2). A Set keeps insertion order, so this
  // selects the first two *different* objects.
  const distinctObjects = [...new Set(terms.objects)];
  core.push(...distinctObjects.slice(0, 2));

  // Without an action verb, a descriptor stands in to add meaning
  if (!hasAction && terms.descriptors.length > 0) {
    core.push(terms.descriptors[0]);
  }

  return core;
}
322+
323+
/**
 * Append abbreviated domain-context terms to the semantic core.
 *
 * Domain terms are detected from the comment text. A term is appended, in
 * abbreviated form, only when all of the following hold:
 *  - it does not overlap an existing term (neither string contains the other);
 *  - the list currently holds fewer than four terms;
 *  - an abbreviation exists and differs from the term itself.
 *
 * The input array is not modified.
 *
 * @param core - The core terms to extend.
 * @param comment - The original comment, used for domain detection.
 * @param context - Accepted for interface compatibility; not read by this function.
 * @returns A new array holding the core terms plus any added context terms.
 */
function enhanceWithContext(core: string[], comment: string, context?: CommentContext): string[] {
  const combined = Array.from(core);

  for (const domainTerm of detectDomainContext(comment)) {
    const overlapsExisting = combined.some(
      existing => existing.includes(domainTerm) || domainTerm.includes(existing)
    );
    if (overlapsExisting || combined.length >= 4) {
      continue;
    }

    // Only the short form is ever added; terms without one are skipped.
    const shortForm = getContextAbbreviation(domainTerm);
    if (!shortForm || shortForm === domainTerm) {
      continue;
    }

    combined.push(shortForm);
  }

  return combined;
}
349+
350+
/**
 * Abbreviate terms only when the space-joined list exceeds `maxLength`.
 *
 * When the joined terms already fit, the input array itself is returned.
 * Otherwise every term that has a selective abbreviation is replaced by it and
 * the rest are kept as they are; the result is not re-checked against
 * `maxLength` here.
 *
 * @param terms - Terms in priority order.
 * @param maxLength - Maximum allowed length of the space-joined terms.
 * @returns The original array, or a new array with abbreviations applied.
 */
function applySelectiveAbbreviations(terms: string[], maxLength: number): string[] {
  const alreadyFits = terms.join(' ').length <= maxLength;
  if (alreadyFits) {
    return terms;
  }

  // Fall back to the unabbreviated term when no abbreviation is available.
  return terms.map(term => getSelectiveAbbreviation(term) || term);
}
368+
369+
/**
 * Look up the short form of a domain-context term.
 *
 * A `Map` is used instead of a plain object so that only the listed terms can
 * match. Indexing an object literal with arbitrary input also finds inherited
 * members such as `constructor`, `toString` or `__proto__`, which would be
 * returned as if they were abbreviations.
 *
 * @param contextTerm - The term to abbreviate (matched exactly, case-sensitive).
 * @returns The abbreviation, or `null` when the term has none.
 */
function getContextAbbreviation(contextTerm: string): string | null {
  const abbreviations = new Map<string, string>([
    ['authentication', 'auth'],
    ['database', 'db'],
    ['configuration', 'config'],
    ['repository', 'repo'],
    ['application', 'app']
  ]);

  return abbreviations.get(contextTerm) ?? null;
}
383+
384+
/**
 * Look up an abbreviation for a long term.
 *
 * Terms of eight characters or fewer are never abbreviated. Longer terms are
 * matched exactly against a fixed table. A `Map` is used so that inherited
 * object members (`constructor`, `hasOwnProperty`, `__proto__`, ...) can never
 * be returned as if they were abbreviations.
 *
 * @param term - The term to abbreviate (matched exactly, case-sensitive).
 * @returns The abbreviation, or `null` when the term is short or has none.
 */
function getSelectiveAbbreviation(term: string): string | null {
  // Only abbreviate terms longer than 8 characters
  if (term.length <= 8) return null;

  const abbreviations = new Map<string, string>([
    ['authentication', 'auth'],
    ['configuration', 'config'],
    ['repository', 'repo'],
    ['application', 'app'],
    ['management', 'mgmt'],
    ['processing', 'proc'],
    ['generation', 'gen'],
    ['initialization', 'init'],
    ['validation', 'valid']
  ]);

  return abbreviations.get(term) ?? null;
}
405+
406+
/**
 * Turn the selected terms into the final string, enforcing the length limit
 * and a minimum level of semantic quality.
 *
 * - If the joined terms are too long, trailing terms are dropped until the
 *   text fits. Term reduction always keeps at least one term, so a single
 *   term longer than `maxLength` can survive it; in that case the original
 *   comment is truncated instead so the limit is never exceeded.
 * - If the joined terms fit but fail the semantic-meaning check, the original
 *   comment is truncated instead.
 * - Otherwise the joined terms are returned.
 *
 * @param terms - Selected terms, most important first.
 * @param originalComment - The unmodified comment, used for validation and fallback.
 * @param maxLength - Maximum number of characters allowed in the result.
 * @returns The finalized text.
 */
function validateAndFinalize(terms: string[], originalComment: string, maxLength: number): string {
  const result = terms.join(' ');

  // Check if result fits within length limit
  if (result.length > maxLength) {
    // Try removing least important terms
    const reduced = reduceToFit(terms, maxLength);

    // Guard the limit: reduction stops at one term even if it is too long.
    return reduced.length <= maxLength
      ? reduced
      : intelligentTruncation(originalComment, maxLength);
  }

  // Validate semantic quality
  if (!hasSemanticMeaning(result, originalComment)) {
    // Fallback to intelligent truncation
    return intelligentTruncation(originalComment, maxLength);
  }

  return result;
}
427+
428+
/**
 * Drop trailing terms until the space-joined text fits within `maxLength`.
 *
 * Terms are assumed to be ordered most important first, so removal starts at
 * the end. At least one term is always kept: when even the first term alone
 * is longer than `maxLength`, it is returned as is. An empty list yields an
 * empty string. The input array is not modified.
 *
 * @param terms - Terms in priority order.
 * @param maxLength - Target maximum length of the joined text.
 * @returns The joined leading terms.
 */
function reduceToFit(terms: string[], maxLength: number): string {
  let keepCount = terms.length;
  let candidate = terms.join(' ');

  // Shrink the kept prefix one term at a time, never below a single term.
  while (keepCount > 1 && candidate.length > maxLength) {
    keepCount -= 1;
    candidate = terms.slice(0, keepCount).join(' ');
  }

  return candidate;
}
442+
443+
/**
 * Decide whether a condensed result still says something about the original
 * comment.
 *
 * The result passes when it has at least two words longer than two characters
 * and at least one of those words overlaps a word of the original (one
 * contains the other). The comparison is case-insensitive.
 *
 * Original words of two characters or fewer, including the empty strings that
 * splitting produces for leading or trailing whitespace, are ignored. Without
 * this, tokens like `''`, `a` or `is` are contained in almost every result
 * word and the overlap check would always succeed.
 *
 * @param result - The condensed text, words separated by single spaces.
 * @param original - The original comment text.
 * @returns `true` when the result is considered meaningful.
 */
function hasSemanticMeaning(result: string, original: string): boolean {
  const isSubstantial = (word: string): boolean => word.length > 2;

  // Must have at least 2 meaningful words
  const words = result.toLowerCase().split(' ').filter(isSubstantial);
  if (words.length < 2) return false;

  // Should contain at least one term that also occurs in the original
  const originalWords = original.toLowerCase().split(/\s+/).filter(isSubstantial);

  return words.some(word =>
    originalWords.some(orig => orig.includes(word) || word.includes(orig))
  );
}
459+
460+
/**
 * Fallback that shortens text to at most `maxLength` characters, preferring
 * to end on a complete word.
 *
 * - Text that already fits is returned unchanged.
 * - If the cut lands exactly on a word boundary (the next character is
 *   whitespace), everything before the cut is kept, since the last word is
 *   complete.
 * - Otherwise the partial last word is dropped, provided that keeps more than
 *   70% of `maxLength`; if not, the text is cut mid-word.
 *
 * Trailing whitespace is removed from any shortened result.
 *
 * @param text - The text to shorten.
 * @param maxLength - Maximum number of characters allowed in the result.
 * @returns The shortened text.
 */
function intelligentTruncation(text: string, maxLength: number): string {
  if (text.length <= maxLength) return text;

  const truncated = text.substring(0, maxLength);

  // The cut fell between words, so the last word inside the limit is whole.
  if (/\s/.test(text.charAt(maxLength))) {
    return truncated.trimEnd();
  }

  // Find last complete word that fits
  const lastSpace = truncated.lastIndexOf(' ');
  if (lastSpace > maxLength * 0.7) {
    return truncated.substring(0, lastSpace).trimEnd();
  }

  // Dropping the partial word would discard too much; keep the hard cut.
  return truncated.trimEnd();
}
230476

231477
/**
232-
* Prioritize keywords based on detected context domains
478+
* Prioritize keywords based on detected context domains (LEGACY - keeping for compatibility)
233479
*/
234480
function prioritizeKeywordsByContext(keywords: string[], domains: string[]): string[] {
235481
const priorityMap: Record<string, string[]> = {

0 commit comments

Comments
 (0)