Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
382 changes: 74 additions & 308 deletions AI_Dashboard_Implementation/scripts/data_gen.py

Large diffs are not rendered by default.

850 changes: 130 additions & 720 deletions AI_Dashboard_Implementation/scripts/viz.py

Large diffs are not rendered by default.

24 changes: 22 additions & 2 deletions AI_Knowledge_Extraction_System/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,39 @@
"""
AI Knowledge Extraction System
Expert-level content processing and semantic analysis for Knowledge-Base materials
==============================

A comprehensive system for expert-level content processing, semantic analysis,
and knowledge extraction from a variety of file formats.

This package provides the core components for building a sophisticated
knowledge pipeline, including content extraction, semantic processing, and
overall orchestration.

Key Components:
- `KnowledgeExtractionOrchestrator`: The main coordinator for the entire pipeline.
- `ContentExtractor`: For extracting text and metadata from files.
- `SemanticProcessor`: For advanced NLP and semantic analysis.
- `config`: A centralized configuration object for all system settings.
"""

# The version of the AI Knowledge Extraction System package.
__version__ = "1.0.0"

# The designated author of the package.
__author__ = "AI Knowledge Extraction System"

# Import key classes and objects to make them directly accessible from the package level.
# e.g., from AI_Knowledge_Extraction_System import KnowledgeExtractionOrchestrator
from .core.orchestrator import KnowledgeExtractionOrchestrator
from .processors.content_extractor import ContentExtractor
from .processors.semantic_processor import SemanticProcessor
from .config.config import config

# Define the public API of the package. When a user writes `from AI_Knowledge_Extraction_System import *`,
# only these names will be imported.
__all__ = [
"KnowledgeExtractionOrchestrator",
"ContentExtractor",
"ContentExtractor",
"SemanticProcessor",
"config"
]
7 changes: 6 additions & 1 deletion AI_Knowledge_Extraction_System/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
"""
Configuration package for the AI Knowledge Extraction System.

This package contains the configuration modules for the system, providing
a centralized place to manage settings and parameters.
"""
142 changes: 72 additions & 70 deletions AI_Knowledge_Extraction_System/config/config.py
Original file line number Diff line number Diff line change
@@ -1,132 +1,134 @@
"""
AI Knowledge Extraction System Configuration
Enhanced configuration for expert-level content processing
Configuration settings for the AI Knowledge Extraction System.

This module centralizes all configuration parameters for the system,
including file paths, processing settings, AI model identifiers, and output
structures. This approach makes it easy to manage and update the system's
behavior from a single location.
"""

import os
from pathlib import Path
from typing import Dict, List, Any

class KnowledgeExtractionConfig:
"""Configuration class for the Knowledge Extraction System"""

# Base directories
"""
A configuration class for the AI Knowledge Extraction System.

This class encapsulates all static settings, from directory paths to
complex dictionaries defining AI model parameters and data schemas.
"""

# --- Core Paths ---
# Defines the base directories used throughout the system for input and output.
BASE_DIR = Path(__file__).parent.parent.parent
KNOWLEDGE_BASE_DIR = BASE_DIR / "Knowledge-Base"
OUTPUT_DIR = BASE_DIR / "AI_Knowledge_Extraction_System" / "outputs"

# Processing configuration

# --- Document Processing Parameters ---
# Governs how files are read, chunked, and processed.
PROCESSING_CONFIG = {
"chunk_size": 1000,
"chunk_overlap": 200,
"max_file_size_mb": 100,
"enable_ocr": True,
"enable_image_analysis": True,
"enable_vector_embeddings": True,
"enable_knowledge_graph": True,
"enable_semantic_labeling": True,
"chunk_size": 1000, # Size of text chunks for processing
"chunk_overlap": 200, # Overlap between chunks to maintain context
"max_file_size_mb": 100, # Maximum file size to process
"enable_ocr": True, # Enable Optical Character Recognition for images/PDFs
"enable_image_analysis": True, # Enable analysis of image content
"enable_vector_embeddings": True, # Generate vector embeddings for text
"enable_knowledge_graph": True, # Construct a knowledge graph
"enable_semantic_labeling": True, # Automatically label content
}

# File type configurations

# --- Supported File Types ---
# A dictionary categorizing all file extensions the system is designed to handle.
SUPPORTED_FILE_TYPES = {
"documents": [".pdf", ".txt", ".md", ".doc", ".docx"],
"code": [".py", ".js", ".jsx", ".html", ".css", ".xml", ".json"],
"images": [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".svg"],
"notebooks": [".ipynb"],
"data": [".csv", ".json", ".xml", ".yaml", ".yml"]
}

# Metadata schema

# --- Metadata Schema ---
# Defines the structure and data types for the metadata to be extracted from each file.
METADATA_SCHEMA = {
"file_info": {
"filename": str,
"file_path": str,
"file_type": str,
"file_size": int,
"creation_date": str,
"modification_date": str,
"hash": str
"filename": str, "file_path": str, "file_type": str, "file_size": int,
"creation_date": str, "modification_date": str, "hash": str
},
"content_info": {
"content_type": str,
"language": str,
"char_count": int,
"word_count": int,
"line_count": int,
"encoding": str
"content_type": str, "language": str, "char_count": int,
"word_count": int, "line_count": int, "encoding": str
},
"semantic_info": {
"topics": List[str],
"keywords": List[str],
"entities": List[Dict[str, Any]],
"sentiment": float,
"complexity_score": float,
"domain_classification": str
"topics": List[str], "keywords": List[str], "entities": List[Dict[str, Any]],
"sentiment": float, "complexity_score": float, "domain_classification": str
},
"ai_processing": {
"embedding_model": str,
"embedding_dimension": int,
"chunk_count": int,
"processing_timestamp": str,
"processing_version": str
"embedding_model": str, "embedding_dimension": int, "chunk_count": int,
"processing_timestamp": str, "processing_version": str
}
}

# Output structure

# --- Output Directory Structure ---
# Defines the names of the subdirectories for storing processed outputs.
OUTPUT_STRUCTURE = {
"processed_content": "processed_documents",
"embeddings": "vector_embeddings",
"embeddings": "vector_embeddings",
"knowledge_graph": "knowledge_graphs",
"metadata": "metadata_catalog",
"indexes": "search_indexes",
"summaries": "content_summaries",
"ai_ready": "ai_training_data"
}

# AI Model configurations

# --- AI Model Configurations ---
# Specifies the pre-trained models to be used for various AI tasks.
AI_MODEL_CONFIG = {
"embedding_model": "all-MiniLM-L6-v2", # Lightweight but effective
"chunk_embedding_model": "all-mpnet-base-v2", # Better for semantic search
"classification_model": "distilbert-base-uncased",
"summarization_model": "facebook/bart-large-cnn",
"max_tokens": 512,
"similarity_threshold": 0.7
"embedding_model": "all-MiniLM-L6-v2", # For generating document embeddings
"chunk_embedding_model": "all-mpnet-base-v2", # For more detailed semantic search
"classification_model": "distilbert-base-uncased", # For content classification
"summarization_model": "facebook/bart-large-cnn", # For generating summaries
"max_tokens": 512, # Max tokens for model inputs
"similarity_threshold": 0.7 # Threshold for semantic similarity searches
}

# Knowledge graph configuration

# --- Knowledge Graph Configuration ---
# Parameters for building the knowledge graph from extracted entities and concepts.
KNOWLEDGE_GRAPH_CONFIG = {
"node_types": ["document", "concept", "entity", "topic", "code_snippet", "image"],
"relationship_types": ["references", "contains", "similar_to", "part_of", "implements", "describes"],
"min_edge_weight": 0.5,
"max_nodes_per_document": 50
"min_edge_weight": 0.5, # Minimum confidence to create a relationship
"max_nodes_per_document": 50 # Max graph nodes to extract from a single document
}

# Semantic labeling configuration

# --- Semantic Labeling Configuration ---
# Defines the categories and thresholds for automatic content labeling.
SEMANTIC_LABELING_CONFIG = {
"domain_categories": [
"data_visualization", "dashboard_development", "plotly_dash",
"python_programming", "machine_learning", "data_analysis",
"business_intelligence", "ui_ux_design", "technical_documentation"
],
"content_types": [
"tutorial", "reference", "example", "best_practice",
"tutorial", "reference", "example", "best_practice",
"troubleshooting", "api_documentation", "code_sample"
],
"difficulty_levels": ["beginner", "intermediate", "advanced", "expert"],
"auto_tag_threshold": 0.6
"auto_tag_threshold": 0.6 # Confidence threshold for applying a tag
}

# Quality assurance

# --- Quality Assurance Parameters ---
# Defines rules and checks to ensure the quality of the processed data.
QUALITY_CONFIG = {
"min_content_length": 100,
"max_processing_time_per_file": 300, # seconds
"validation_checks": [
"min_content_length": 100, # Minimum characters for a document to be processed
"max_processing_time_per_file": 300, # Max seconds per file to prevent hangs
"validation_checks": [ # List of quality checks to perform
"content_extraction",
"metadata_completeness",
"metadata_completeness",
"embedding_generation",
"graph_connectivity"
]
}

# Global configuration instance
# Create a global instance of the configuration class for easy access
# across the application.
config = KnowledgeExtractionConfig()
7 changes: 6 additions & 1 deletion AI_Knowledge_Extraction_System/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
"""
Core package for the AI Knowledge Extraction System.

This package contains the central components that orchestrate and manage the
knowledge extraction and processing pipeline.
"""
Loading