llm-reasoning-framework/test_my_problem.py at main · matdev83/llm-reasoning-framework · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
#!/usr/bin/env python3
"""
Test the hybrid thinking model with your own problems.

Usage:
    python test_my_problem.py "Your problem here"

Requirements:
    - Set OPENROUTER_API_KEY environment variable
    - Activate virtual environment (.venv\\Scripts\\activate)
"""

import sys
import os
import logging
import asyncio
from pathlib import Path

# Add the src/ directory that sits next to this script to the front of the
# module search path; the imports that follow are placed after this line.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from hybrid.processor import HybridProcessor
from hybrid.dataclasses import HybridConfig, ReasoningConfig
from llm_client import LLMClient
from hybrid.orchestrator import HybridOrchestrator
from llm_config import LLMConfig

# Set up logging to see what's happening: DEBUG threshold, and each record
# rendered as "LEVEL [logger name] message".
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s [%(name)s] %(message)s')

def test_hybrid_thinking_debug(problem_text, api_key=None):
    """Solve one problem with the hybrid pipeline and print a verbose report.

    An explicit ReasoningConfig (effort "medium", reasoning not excluded) is
    handed to HybridConfig. On success the extracted reasoning is printed
    twice: first through repr() so empty or whitespace-only output is
    visible, then as plain text.

    Args:
        problem_text: Problem statement passed to HybridProcessor.run().
        api_key: OpenRouter API key. When falsy, the OPENROUTER_API_KEY
            environment variable is used instead.

    Returns:
        True when the run reports success; False when no API key is
        available or the run reports failure.
    """
    # Explicit argument wins; the environment is only consulted as a fallback.
    api_key = api_key or os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OPENROUTER_API_KEY environment variable not set")
        print("   Get one from https://openrouter.ai/")
        return False

    thinking_prompt = (
        "Problem: {problem_description}\n\n"
        "Think step-by-step to solve this problem. "
        "When you finish your reasoning, output exactly: {reasoning_complete_token}\n\n"
        "Reasoning:"
    )
    answer_prompt = (
        "<problem>\n"
        "{problem_description}\n"
        "</problem>\n"
        "\n"
        "<reasoning>\n"
        "<extracted_thoughts>\n"
        "{extracted_reasoning}\n"
        "</extracted_thoughts>\n"
        "</reasoning>\n"
        "\n"
        "<instructions>\n"
        "Based on the problem and the reasoning provided above, provide a clear final answer.\n"
        "</instructions>"
    )
    deepseek_headers = {
        "HTTP-Referer": "https://your-app.com",
        "X-Title": "Hybrid Reasoning Test",
    }

    # Original author's note: DeepSeek-R1 models don't support effort levels,
    # so "effort" is expected to be filtered out of the effective config.
    reasoning_config = ReasoningConfig(enabled=True, effort="medium", exclude=False)

    config = HybridConfig(
        reasoning_model_name="deepseek/deepseek-r1-0528:free",
        response_model_name="openrouter/cypher-alpha:free",
        reasoning_model_temperature=0.1,
        response_model_temperature=0.3,
        reasoning_complete_token="<REASONING_COMPLETE>",
        reasoning_prompt_template=thinking_prompt,
        response_prompt_template=answer_prompt,
        max_reasoning_tokens=800,
        max_response_tokens=400,
        reasoning_config=reasoning_config,
        use_streaming=True,
        model_specific_headers={"deepseek": deepseek_headers},
    )

    # Report what the config resolves to for the chosen reasoning model.
    effective_config = config.get_effective_reasoning_config(config.reasoning_model_name)
    print(f"🔧 Model: {config.reasoning_model_name}")
    if effective_config:
        config_summary = effective_config.to_openrouter_dict()
    else:
        config_summary = 'None (no reasoning support)'
    print(f"📋 Effective reasoning config: {config_summary}")
    print()

    processor = HybridProcessor(llm_client=LLMClient(api_key=api_key), config=config)

    print(f"🤔 Testing problem: {problem_text}")
    print("🔄 Running hybrid thinking model with OpenRouter reasoning...")
    print()

    result = processor.run(problem_text)

    # Failure path first so the report below reads straight through.
    if not result.succeeded:
        print("❌ FAILED!")
        print(f"Error: {result.error_message}")
        return False

    divider = "-" * 50
    print("✅ SUCCESS!")
    print()
    print("🧠 REASONING EXTRACTED:")
    print(divider)
    print(repr(result.extracted_reasoning))  # repr exposes whitespace/empty strings
    print(divider)
    print(result.extracted_reasoning)
    print()
    print("💡 FINAL ANSWER:")
    print(divider)
    print(result.final_answer)
    print()
    print("📊 STATS:")
    print(f"   Reasoning tokens: {result.reasoning_call_stats.completion_tokens}")
    print(f"   Response tokens: {result.response_call_stats.completion_tokens}")
    elapsed = (
        result.reasoning_call_stats.call_duration_seconds
        + result.response_call_stats.call_duration_seconds
    )
    print(f"   Total time: {elapsed:.2f}s")
    print(f"   Detected format: {result.detected_reasoning_format}")
    reasoning_length = len(result.extracted_reasoning)
    print(f"   Reasoning length: {reasoning_length} characters")
    # Heuristic only: anything over 200 characters is reported as preserved.
    if len(result.extracted_reasoning) > 200:
        preserved_marker = '✅'
    else:
        preserved_marker = '⚠️  (may be truncated by model)'
    print(f"   Full reasoning preserved: {preserved_marker}")
    return True

def test_hybrid_thinking(problem_text, api_key=None):
    """Solve one problem with the hybrid pipeline using default reasoning settings.

    HybridConfig is built with reasoning_config=None; the effective reasoning
    config it resolves for the reasoning model is printed before the run.

    Args:
        problem_text: Problem statement passed to HybridProcessor.run().
        api_key: OpenRouter API key. When falsy, the OPENROUTER_API_KEY
            environment variable is used instead.

    Returns:
        True when the run reports success; False when no API key is
        available or the run reports failure.
    """
    # Explicit argument wins; the environment is only consulted as a fallback.
    api_key = api_key or os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OPENROUTER_API_KEY environment variable not set")
        print("   Get one from https://openrouter.ai/")
        return False

    thinking_prompt = (
        "Problem: {problem_description}\n\n"
        "Think step-by-step to solve this problem. "
        "When you finish your reasoning, output exactly: {reasoning_complete_token}\n\n"
        "Reasoning:"
    )
    answer_prompt = (
        "<problem>\n"
        "{problem_description}\n"
        "</problem>\n"
        "\n"
        "<reasoning>\n"
        "<extracted_thoughts>\n"
        "{extracted_reasoning}\n"
        "</extracted_thoughts>\n"
        "</reasoning>\n"
        "\n"
        "<instructions>\n"
        "Based on the problem and the reasoning provided above, provide a clear final answer.\n"
        "</instructions>"
    )
    deepseek_headers = {
        "HTTP-Referer": "https://your-app.com",
        "X-Title": "Hybrid Reasoning Test",
    }

    # No explicit reasoning config is supplied (reasoning_config=None).
    config = HybridConfig(
        reasoning_model_name="deepseek/deepseek-r1-0528:free",
        response_model_name="openrouter/cypher-alpha:free",
        reasoning_model_temperature=0.1,
        response_model_temperature=0.3,
        reasoning_complete_token="<REASONING_COMPLETE>",
        reasoning_prompt_template=thinking_prompt,
        response_prompt_template=answer_prompt,
        max_reasoning_tokens=800,
        max_response_tokens=400,
        reasoning_config=None,
        use_streaming=True,
        model_specific_headers={"deepseek": deepseek_headers},
    )

    # Show what the config resolves to when nothing explicit was given.
    effective_config = config.get_effective_reasoning_config(config.reasoning_model_name)
    print(f"🔧 Model: {config.reasoning_model_name}")
    if effective_config:
        config_summary = effective_config.to_openrouter_dict()
    else:
        config_summary = 'None'
    print(f"📋 Using default reasoning config: {config_summary}")
    print()

    processor = HybridProcessor(llm_client=LLMClient(api_key=api_key), config=config)

    print(f"🤔 Testing problem: {problem_text}")
    print("🔄 Running hybrid thinking model with default configuration...")
    print()

    result = processor.run(problem_text)

    # Failure path first so the report below reads straight through.
    if not result.succeeded:
        print("❌ FAILED!")
        print(f"Error: {result.error_message}")
        return False

    divider = "-" * 50
    print("✅ SUCCESS!")
    print()
    print("🧠 REASONING EXTRACTED:")
    print(divider)
    print(result.extracted_reasoning)  # printed in full, no truncation here
    print()
    print("💡 FINAL ANSWER:")
    print(divider)
    print(result.final_answer)
    print()
    print("📊 STATS:")
    print(f"   Reasoning tokens: {result.reasoning_call_stats.completion_tokens}")
    print(f"   Response tokens: {result.response_call_stats.completion_tokens}")
    elapsed = (
        result.reasoning_call_stats.call_duration_seconds
        + result.response_call_stats.call_duration_seconds
    )
    print(f"   Total time: {elapsed:.2f}s")
    print(f"   Detected format: {result.detected_reasoning_format}")
    reasoning_length = len(result.extracted_reasoning)
    print(f"   Reasoning length: {reasoning_length} characters")
    # Heuristic only: anything over 200 characters is reported as preserved.
    if len(result.extracted_reasoning) > 200:
        preserved_marker = '✅'
    else:
        preserved_marker = '⚠️  (may be truncated by model)'
    print(f"   Full reasoning preserved: {preserved_marker}")
    return True

def test_model_default_reasoning_configs():
    """Print and check the reasoning config HybridConfig resolves per model.

    For each listed model a HybridConfig is built with reasoning_config=None,
    and both get_model_default_reasoning_config() and
    get_effective_reasoning_config() are queried. The effective config,
    converted with to_openrouter_dict(), is compared against the expected
    dict (or None for the non-reasoning model). Mismatches are only printed;
    nothing is raised or returned.
    """
    banner = "=" * 70
    print("🎯 Testing model-specific default reasoning configurations:")
    print(banner)

    # (display name, model id, expected effective config as an OpenRouter dict)
    cases = (
        ("OpenAI o3-mini", "openai/o3-mini",
         {"enabled": True, "effort": "high", "exclude": False}),
        ("Grok Beta", "grok/grok-beta",
         {"enabled": True, "effort": "high", "exclude": False}),
        ("Gemini Thinking", "google/gemini-2.5-flash-preview:thinking",
         {"enabled": True, "max_tokens": 8000, "exclude": False}),
        ("Anthropic Claude", "anthropic/claude-3.7-sonnet",
         {"enabled": True, "max_tokens": 4000, "exclude": False}),
        ("DeepSeek-R1", "deepseek/deepseek-r1-0528:free",
         {"enabled": True, "exclude": False}),
        ("Non-reasoning model", "openrouter/cypher-alpha:free", None),
    )

    def render(reasoning_cfg):
        # Falsy config objects are shown as the literal text 'None'.
        return reasoning_cfg.to_openrouter_dict() if reasoning_cfg else 'None'

    for display_name, model_id, expected in cases:
        # No explicit reasoning config, so whatever comes back is a default.
        config = HybridConfig(
            reasoning_model_name=model_id,
            response_model_name="openrouter/cypher-alpha:free",
            reasoning_config=None,
        )

        default_config = config.get_model_default_reasoning_config(model_id)
        effective_config = config.get_effective_reasoning_config(model_id)

        print(f"\n🤖 {display_name}")
        print(f"   Model: {model_id}")
        print(f"   Default config: {render(default_config)}")
        print(f"   Effective config: {render(effective_config)}")

        if not effective_config:
            # Nothing resolved: correct only when nothing was expected.
            if expected is None:
                print("   ✅ Correctly identified as non-reasoning model")
            else:
                print("   ❌ Expected default config but got None")
            continue

        actual = effective_config.to_openrouter_dict()
        if actual == expected:
            print("   ✅ Default configuration is correct")
        else:
            print(f"   ❌ Unexpected default: expected {expected}, got {actual}")

    print("\n" + banner)
    for summary_line in (
        "📋 Default Configuration Summary:",
        "   • OpenAI/Grok models: effort='high' (best reasoning quality)",
        "   • Gemini Thinking: max_tokens=8,000 (within output limits)",
        "   • Anthropic Claude: max_tokens=4,000 (within output limits)",
        "   • DeepSeek-R1: enabled=True (basic reasoning only)",
        "   • Other models: No reasoning support",
    ):
        print(summary_line)

def test_different_reasoning_models():
    """Print and check how an explicit ReasoningConfig is filtered per model.

    For each case a HybridConfig is built with the given ReasoningConfig and
    get_effective_reasoning_config() is queried for the reasoning model. The
    result, converted with to_openrouter_dict(), is compared against the
    expected dict (or None where no reasoning support is expected).
    Mismatches are only printed; nothing is raised or returned.

    Only configuration objects are built and queried here and no API key is
    passed anywhere, so the function does not require OPENROUTER_API_KEY.
    (It used to read the variable and return early when it was missing, but
    never used the value.)
    """
    # Each case: the config the caller asks for and what should survive filtering.
    test_configs = [
        {
            "name": "DeepSeek-R1 (Basic reasoning only)",
            "model": "deepseek/deepseek-r1-0528:free",
            "reasoning_config": ReasoningConfig(enabled=True, effort="high", exclude=False),
            "expected_effective": {"enabled": True, "exclude": False}  # effort will be filtered out
        },
        {
            "name": "OpenAI o3-mini (Effort levels supported)",
            "model": "openai/o3-mini",
            "reasoning_config": ReasoningConfig(enabled=True, effort="high", exclude=False),
            "expected_effective": {"enabled": True, "effort": "high", "exclude": False}
        },
        {
            "name": "Anthropic Claude (Max tokens supported)",
            "model": "anthropic/claude-3.7-sonnet",
            "reasoning_config": ReasoningConfig(enabled=True, max_tokens=2000, exclude=False),
            "expected_effective": {"enabled": True, "max_tokens": 2000, "exclude": False}
        },
        {
            "name": "Non-reasoning model",
            "model": "openrouter/cypher-alpha:free",
            "reasoning_config": ReasoningConfig(enabled=True, effort="high", exclude=False),
            "expected_effective": None  # No reasoning support
        }
    ]

    print("🧪 Testing model-specific reasoning configuration filtering:")
    print("=" * 70)

    for test_config in test_configs:
        model = test_config["model"]
        requested = test_config["reasoning_config"]
        expected = test_config["expected_effective"]

        config = HybridConfig(
            reasoning_model_name=model,
            response_model_name="openrouter/cypher-alpha:free",
            reasoning_config=requested
        )

        effective_config = config.get_effective_reasoning_config(model)

        print(f"\n📱 {test_config['name']}")
        print(f"   Model: {model}")
        print(f"   Original config: {requested.to_openrouter_dict()}")
        print(f"   Effective config: {effective_config.to_openrouter_dict() if effective_config else 'None'}")

        # Verify the filtering worked as expected
        if effective_config:
            actual = effective_config.to_openrouter_dict()
            if actual == expected:
                print("   ✅ Filtering worked correctly")
            else:
                print(f"   ❌ Unexpected filtering: expected {expected}, got {actual}")
        elif expected is None:
            print("   ✅ Correctly identified as non-reasoning model")
        else:
            print("   ❌ Expected reasoning support but got None")

    print("\n" + "=" * 70)

def test_token_limits():
    """Print effective token limits for four model pairings and compare two.

    get_effective_token_limits() is queried on a HybridConfig for Gemini
    Thinking, Claude, DeepSeek and OpenAI model pairs. The Gemini limits are
    then asserted to be strictly greater than the Claude limits for both
    "max_reasoning_tokens" and "max_response_tokens".

    Raises:
        AssertionError: If either Gemini limit is not above the Claude one.
    """
    print("\n=== Testing Model-Specific Token Limits ===")

    def limits_for(reasoning_model, response_model):
        # Build a config with library defaults and ask it for its limits.
        cfg = HybridConfig(
            reasoning_model_name=reasoning_model,
            response_model_name=response_model,
        )
        return cfg.get_effective_token_limits()

    gemini_limits = limits_for(
        "google/gemini-2.5-flash-preview:thinking",
        "google/gemini-2.5-flash-preview",
    )
    print(f"Gemini Thinking limits: {gemini_limits}")

    claude_limits = limits_for(
        "anthropic/claude-3.5-sonnet-20241022",
        "anthropic/claude-3.5-sonnet-20241022",
    )
    print(f"Claude limits: {claude_limits}")

    # DeepSeek and OpenAI limits are displayed only; nothing is asserted on them.
    deepseek_limits = limits_for("deepseek/deepseek-r1:nitro", "deepseek/deepseek-r1:nitro")
    print(f"DeepSeek limits: {deepseek_limits}")

    openai_limits = limits_for("openai/o1-preview", "openai/gpt-4o")
    print(f"OpenAI limits: {openai_limits}")

    # Gemini must come out ahead of Claude on both budgets.
    for limit_key, label in (
        ("max_reasoning_tokens", "reasoning"),
        ("max_response_tokens", "response"),
    ):
        assert gemini_limits[limit_key] > claude_limits[limit_key], \
            f"Gemini should get higher {label} token allocation"

    print("✅ All token limit tests passed!")

def test_reasoning_with_gemini():
    """Check Gemini token limits, then attempt a live run on a best-effort basis.

    The configuration part asserts that get_effective_token_limits() yields
    12000 reasoning tokens and 4000 response tokens. The live part builds an
    LLMClient and HybridOrchestrator and runs one fixed problem; any
    exception raised there is caught and reported instead of propagating, so
    the function still finishes when the API cannot be reached.

    Raises:
        AssertionError: If the effective token limits differ from the
            expected values.
    """
    print("\n=== Testing Gemini Reasoning with Increased Token Limits ===")

    thinking_budget = ReasoningConfig(enabled=True, max_tokens=8000, exclude=False)
    config = HybridConfig(
        reasoning_model_name="google/gemini-2.5-flash-preview:thinking",
        response_model_name="google/gemini-2.5-flash-preview",
        reasoning_config=thinking_budget,
        use_streaming=True,
    )

    token_limits = config.get_effective_token_limits()
    print(f"Effective token limits: {token_limits}")

    # Expected allocation for Gemini: 12K reasoning, 4K response.
    for limit_key, label, expected in (
        ("max_reasoning_tokens", "reasoning", 12000),
        ("max_response_tokens", "response", 4000),
    ):
        assert token_limits[limit_key] == expected, \
            f"Expected {expected} {label} tokens, got {token_limits[limit_key]}"

    print(f"✅ Gemini model configured with {token_limits['max_reasoning_tokens']} reasoning tokens and {token_limits['max_response_tokens']} response tokens")

    # Live call: deliberately tolerant, failures are reported but not raised.
    try:
        orchestrator = HybridOrchestrator(LLMClient(), config)

        problem = "What is the most efficient sorting algorithm for large datasets and why?"

        print(f"\nTesting reasoning extraction with problem: {problem}")
        outcome = orchestrator.run(problem).hybrid_result

        if not (outcome and outcome.succeeded):
            print("❌ Reasoning extraction failed")
            if outcome:
                print(f"Error: {outcome.error_message}")
        else:
            print("✅ Reasoning extraction successful!")
            print(f"Reasoning format: {outcome.detected_reasoning_format}")

            reasoning_stats = outcome.reasoning_call_stats
            if reasoning_stats:
                reasoning_tokens = reasoning_stats.completion_tokens
                print(f"Reasoning tokens used: {reasoning_tokens}")

                # Utilization is only meaningful when tokens were reported.
                if reasoning_tokens > 0:
                    utilization = reasoning_tokens / token_limits["max_reasoning_tokens"]
                    print(f"Token utilization: {utilization:.1%}")
                    print(
                        "⚠️  High token utilization - consider increasing limits"
                        if utilization > 0.8
                        else "✅ Good token utilization"
                    )

            response_stats = outcome.response_call_stats
            if response_stats:
                response_tokens = response_stats.completion_tokens
                print(f"Response tokens used: {response_tokens}")

    except Exception as e:
        print(f"⚠️  Could not test with actual API (probably no API key): {e}")
        print("✅ Configuration test passed anyway")

def test_token_optimization():
    """Print a per-model report of token allocation and reasoning settings.

    For each case a HybridConfig is built and its effective token limits are
    compared with the expected reasoning/response allocation; the result is
    printed, not asserted. The report also shows the reasoning/response
    split, the effective reasoning config, and a use-case blurb from
    get_use_case_description().
    """
    print("\n=== Testing Token Optimization Across Models ===")

    # (label, reasoning model, response model,
    #  expected reasoning tokens, expected response tokens, reasoning config)
    cases = (
        (
            "Gemini Thinking (High Capacity)",
            "google/gemini-2.5-flash-preview:thinking",
            "google/gemini-2.5-flash-preview",
            12000,
            4000,
            ReasoningConfig(enabled=True, max_tokens=8000, exclude=False),
        ),
        (
            "Claude (Balanced)",
            "anthropic/claude-3.5-sonnet-20241022",
            "anthropic/claude-3.5-sonnet-20241022",
            6000,
            2000,
            ReasoningConfig(enabled=True, max_tokens=4000, exclude=False),
        ),
        (
            "OpenAI o-series (Efficient)",
            "openai/o1-preview",
            "openai/gpt-4o",
            32000,
            8000,
            ReasoningConfig(enabled=True, effort="high", exclude=False),
        ),
        (
            "DeepSeek-R1 (Basic)",
            "deepseek/deepseek-r1:nitro",
            "deepseek/deepseek-r1:nitro",
            6000,
            2000,
            ReasoningConfig(enabled=True, exclude=False),
        ),
    )

    wide_banner = "=" * 80
    print("🧪 Testing model-specific token allocation optimization:")
    print(wide_banner)

    for label, reasoning_model, response_model, want_reasoning, want_response, reasoning_cfg in cases:
        print(f"\n🤖 {label}")
        print(f"   Reasoning Model: {reasoning_model}")
        print(f"   Response Model: {response_model}")

        config = HybridConfig(
            reasoning_model_name=reasoning_model,
            response_model_name=response_model,
            reasoning_config=reasoning_cfg,
            use_streaming=True,
        )

        token_limits = config.get_effective_token_limits()
        effective_reasoning = config.get_effective_reasoning_config(reasoning_model)

        got_reasoning = token_limits["max_reasoning_tokens"]
        got_response = token_limits["max_response_tokens"]
        print(f"   Token Limits: {got_reasoning} reasoning + {got_response} response")
        print(f"   Reasoning Config: {effective_reasoning.to_openrouter_dict() if effective_reasoning else 'None'}")

        # Compare against the expected allocation; a mismatch is only reported.
        allocation_matches = got_reasoning == want_reasoning and got_response == want_response
        if not allocation_matches:
            print(f"   ❌ Unexpected allocation: expected {want_reasoning}+{want_response}")
        else:
            print("   ✅ Token allocation is optimal")

        # Share of the combined budget that goes to reasoning.
        total_allocated = got_reasoning + got_response
        reasoning_ratio = got_reasoning / total_allocated
        print(f"   📊 Token Distribution: {reasoning_ratio:.1%} reasoning, {1-reasoning_ratio:.1%} response")

        # Describe how reasoning is controlled: token cap, effort, or neither.
        if effective_reasoning:
            settings = effective_reasoning.to_openrouter_dict()
            if "max_tokens" in settings:
                reasoning_capacity = settings["max_tokens"]
                overhead = got_reasoning - reasoning_capacity
                print(f"   🧠 Reasoning Capacity: {reasoning_capacity:,} tokens (+ {overhead:,} overhead)")
            elif "effort" in settings:
                print(f"   🧠 Reasoning Effort: {settings['effort']} (dynamic allocation)")
            else:
                print("   🧠 Reasoning Mode: Basic (no token control)")

        print(f"   💡 Use Case: {get_use_case_description(label)}")

    print("\n" + wide_banner)
    for summary_line in (
        "📋 Token Optimization Summary:",
        "   • OpenAI o-series: 32K reasoning + 8K response = 40K total (highest output limits)",
        "   • Gemini Thinking: 12K reasoning + 4K response = 16K total (within output limits)",
        "   • Claude: 6K reasoning + 2K response = 8K total (within output limits)",
        "   • DeepSeek-R1: 6K reasoning + 2K response = 8K total (within output limits)",
        "\n💡 Allocations are based on actual OpenRouter API output limits, not context windows",
    ):
        print(summary_line)

def get_use_case_description(model_name: str) -> str:
    """Return a one-line use-case blurb selected by a keyword in *model_name*.

    Keywords are matched as case-sensitive substrings and tried in the order
    Gemini, Claude, OpenAI, DeepSeek; the first hit wins. A name containing
    none of them gets the generic description.
    """
    # Ordered: earlier entries take priority when several keywords match.
    blurbs_by_keyword = (
        ("Gemini", "Complex reasoning requiring extensive thought processes"),
        ("Claude", "Balanced reasoning and response generation"),
        ("OpenAI", "Efficient reasoning with controlled token usage"),
        ("DeepSeek", "Basic reasoning tasks with minimal token overhead"),
    )
    return next(
        (blurb for keyword, blurb in blurbs_by_keyword if keyword in model_name),
        "General purpose reasoning",
    )

def test_qwen_prompt_activation():
    """Exercise prompt-based reasoning activation for a list of Qwen models.

    For every Qwen model this prints get_model_reasoning_support() fields,
    asserts that apply_prompt_based_reasoning() puts "/think" into the prompt
    when reasoning is enabled and "/no_think" when it is disabled, and
    asserts a minimum reasoning-token limit (12000 for names containing
    32b/235b/30b/qwq, 6000 otherwise). Finally it asserts that a Claude
    model's prompt comes back unchanged.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    print("\n=== Testing Qwen Prompt-Based Reasoning Activation ===")

    claude_model = "anthropic/claude-3.5-sonnet-20241022"
    large_model_tags = ["32b", "235b", "30b", "qwq"]
    qwen_models = (
        "qwen/qwen3-4b",
        "qwen/qwen3-14b",
        "qwen/qwen3-32b",
        "qwen/qwq-32b",
        "qwen/qwen3-235b-a22b",
    )

    for model in qwen_models:
        print(f"\n🧠 Testing {model}:")

        config = HybridConfig(
            reasoning_model_name=model,
            response_model_name=claude_model,
            reasoning_config=ReasoningConfig(enabled=True),
        )

        # What the config reports about this model's reasoning controls.
        support = config.get_model_reasoning_support(model)
        print(f"   ✓ Uses prompt activation: {support['uses_prompt_activation']}")
        print(f"   ✓ Supports effort: {support['supports_effort']}")
        print(f"   ✓ Supports max_tokens: {support['supports_max_tokens']}")

        test_prompt = "Solve this math problem: What is 15 * 23?"

        # Reasoning switched on: the prompt must carry /think.
        modified_prompt = config.apply_prompt_based_reasoning(
            prompt=test_prompt,
            model_name=model,
            reasoning_config=ReasoningConfig(enabled=True),
        )
        print(f"   ✓ Enabled prompt: '{modified_prompt}'")
        assert "/think" in modified_prompt, f"Expected /think in prompt for {model}"

        # Reasoning switched off: the prompt must carry /no_think.
        disabled_prompt = config.apply_prompt_based_reasoning(
            prompt=test_prompt,
            model_name=model,
            reasoning_config=ReasoningConfig(enabled=False),
        )
        print(f"   ✓ Disabled prompt: '{disabled_prompt}'")
        assert "/no_think" in disabled_prompt, f"Expected /no_think in prompt for {model}"

        token_limits = config.get_model_specific_token_limits(model)
        print(f"   ✓ Reasoning tokens: {token_limits['max_reasoning_tokens']}")
        print(f"   ✓ Response tokens: {token_limits['max_response_tokens']}")

        # Size class is inferred purely from the model name.
        lowered_name = model.lower()
        if any(tag in lowered_name for tag in large_model_tags):
            minimum, complaint = 12000, f"Large model {model} should have >=12K reasoning tokens"
        else:
            minimum, complaint = 6000, f"Small model {model} should have >=6K reasoning tokens"
        assert token_limits["max_reasoning_tokens"] >= minimum, complaint

    # Control case: a non-Qwen model must get its prompt back untouched.
    print("\n🔍 Testing non-Qwen model (should not modify prompt):")
    config = HybridConfig(
        reasoning_model_name=claude_model,
        response_model_name=claude_model,
    )

    test_prompt = "Solve this problem"
    modified_prompt = config.apply_prompt_based_reasoning(
        prompt=test_prompt,
        model_name=claude_model,
        reasoning_config=ReasoningConfig(enabled=True),
    )
    print(f"   ✓ Claude prompt unchanged: '{modified_prompt}'")
    assert modified_prompt == test_prompt, "Non-Qwen models should not have prompt modified"

    banner = "=" * 70
    print("\n" + banner)
    for summary_line in (
        "📋 Qwen Prompt Activation Summary:",
        "   • Qwen models use /think and /no_think slash commands",
        "   • Prompt-based activation instead of API headers",
        "   • Larger models (32B+) get higher token allocations",
        "   • Automatic detection and activation based on model name",
        "   • Compatible with existing reasoning extraction",
    ):
        print(summary_line)
    print(banner)

def main():
    """Command-line entry point: run one self-test or solve one problem.

    The first command-line argument that is not ``--debug`` selects the
    action. One of the ``--test-*`` flags runs the matching check and
    returns; anything else is taken as the problem text. ``--debug`` may
    appear anywhere on the command line and switches problem solving to
    test_hybrid_thinking_debug().

    Exits with status 1 (after printing the usage text) when there is no
    action argument, and with status 1 when solving the problem reports
    failure.
    """
    cli_args = sys.argv[1:]
    debug_mode = "--debug" in cli_args

    # The action is the first argument other than --debug. Reading sys.argv[1]
    # directly meant `--debug "problem"` (or a lone `--debug`) submitted the
    # literal text "--debug" as the problem.
    positional = [arg for arg in cli_args if arg != "--debug"]

    if not positional:
        usage_lines = (
            "Usage:",
            '  python test_my_problem.py "Your problem here"',
            '  python test_my_problem.py "Your problem here" --debug  # Show full reasoning output',
            "  python test_my_problem.py --test-models    # Test model-specific configurations",
            "  python test_my_problem.py --test-defaults  # Test model-specific default configs",
            "  python test_my_problem.py --test-token-limits  # Test model-specific token limits",
            "  python test_my_problem.py --test-gemini  # Test reasoning with Gemini model and increased token limits",
            "  python test_my_problem.py --test-optimization  # Test token optimization across all models",
            "  python test_my_problem.py --test-qwen-activation  # Test Qwen prompt-based reasoning activation",
            "",
            "Examples:",
            '  python test_my_problem.py "What is 2+2?"',
            '  python test_my_problem.py "Explain quantum computing" --debug',
            "  python test_my_problem.py --test-models",
            "  python test_my_problem.py --test-defaults",
            "  python test_my_problem.py --test-token-limits",
            "  python test_my_problem.py --test-gemini",
            "  python test_my_problem.py --test-optimization",
            "  python test_my_problem.py --test-qwen-activation",
            "",
            "Note: Regular mode shows full reasoning output. Debug mode shows additional technical details.",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    action = positional[0]

    # Built only after the usage check so the no-argument path touches nothing else.
    self_tests = {
        "--test-models": test_different_reasoning_models,
        "--test-defaults": test_model_default_reasoning_configs,
        "--test-token-limits": test_token_limits,
        "--test-gemini": test_reasoning_with_gemini,
        "--test-optimization": test_token_optimization,
        "--test-qwen-activation": test_qwen_prompt_activation,
    }
    if action in self_tests:
        self_tests[action]()
        return

    # Anything else is the problem text; both runners return True on success.
    solve = test_hybrid_thinking_debug if debug_mode else test_hybrid_thinking
    if not solve(action):
        sys.exit(1)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()