NLP_ChatBot/rag_system.py at main · jcestefania/NLP_ChatBot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import logging

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaLLM
from sentence_transformers import CrossEncoder

from vector_store import VectorStoreService

# Name of the model handed to OllamaLLM(model=...) by the chains in this module.
LLM_MODEL = "llama3.2"

# Cross-encoder used to rerank retrieved documents against the user question
# (see RERANKER_MODEL.predict in get_context_and_topic_from_database).
# NOTE: constructed at import time, so importing this module instantiates it.
RERANKER_MODEL = CrossEncoder('BAAI/bge-reranker-v2-m3')

# Main RAG prompt (written in Spanish). It tells the model to answer only from
# the supplied {context}, to copy quantities/measures/times verbatim, to reply
# "El documento no especifica esa información" when a datum is missing, and to
# cite the page number. Variables: {context}, {question}.
# NOTE: the identifier is spelled PROMP_TEMPLATE (sic); answer_query refers to
# it by this exact name.
PROMP_TEMPLATE = PromptTemplate(
    template="""
        Eres un Chef Asistente Experto y un Bibliotecario Documental con acceso estricto a un recetario cubano.
        Tu única fuente de verdad es el CONTEXTO proporcionado. Está TERMINANTEMENTE PROHIBIDO usar conocimientos externos o inventar medidas.

        REGLAS DE ORO PARA EVITAR ERRORES FÁCTICOS:
        1. IDENTIFICACIÓN DE RECETA: El contexto puede contener fragmentos de varias recetas. Identifica cuál es la receta EXACTA que pide el usuario por su nombre.
        2. FILTRADO DE DATOS: Ignora cualquier ingrediente o paso que pertenezca a una receta distinta a la solicitada, aunque aparezca en el mismo fragmento de texto.
        3. FIDELIDAD NUMÉRICA: Copia las cantidades, medidas y tiempos exactamente como aparecen. No aproximes ni conviertas unidades.
        4. SIN ALUCINACIONES: Si el usuario pregunta por un dato (ej. "tiempo de reposo") y este no aparece explícitamente en el texto, responde: "El documento no especifica esa información". No intentes deducirla.

        PASOS DE PROCESAMIENTO MENTAL:
        - Paso A: ¿Qué receta me están pidiendo?
        - Paso B: ¿Está esa receta en el contexto? (Busca el título o nombre de la receta).
        - Paso C: Extrae solo los datos bajo ese título específico.
        - Paso D: Cita el número de página mencionado en el contexto de esta forma: "Según la página x del recetario...".

        CONTEXTO:
        {context}

        PREGUNTA:
        {question}

        RESPUESTA (Directa, precisa y basada únicamente en el texto):
    """,
    input_variables=["context", "question"]
)

# Translation prompt (needed to support English questions). Asks the model to
# translate {text} into {target_language} and to output only the translation.
# Used by answer_query both for the incoming question and the final answer.
TRANSLATION_PROMPT = PromptTemplate(
    template="""Translate the following text to {target_language}.
                Output ONLY the translation, no additional comments.
                TEXT: {text}
                TRANSLATION:""",
    input_variables=["target_language", "text"]
)

# Language-detection prompt (needed to support English questions). Asks the
# model to return only the ISO code of the language of {text} (e.g. 'es', 'en').
# The reply is free-form model output, so callers still have to normalise it.
LANGUAGE_DETECTION_PROMPT = PromptTemplate(
    template="Identify the language of this text. Return ONLY the ISO code (e.g., 'es' for Spanish, 'en' for English). Text: {text}",
    input_variables=["text"]
)

# Recommendation prompt (written in Spanish). Given the category {topic} of the
# recipe the user just asked about, asks the model for two short sentences
# recommending the recipes listed in {suggestions}, written in {language}.
REC_PROMPT = PromptTemplate(
    template="""
    Eres un Chef experto. Basándote en que el usuario acaba de preguntar por una receta de la categoría '{topic}',
    sugiere en 2 frases cortas de forma muy breve y elegante que pruebe estas otras: {suggestions}.
    No uses frases genéricas. Escribe como si fueras un chef real recomendando su menú.
    Idioma: {language}
    Respuesta:""",
    input_variables=["topic", "suggestions", "language"]
)

def get_recommendations_from_database(topic: str, current_recipe: str) -> str:
    """Return up to three other recipe names that share ``topic``.

    The lookup is best-effort: recommendations are an optional extra, so any
    failure while querying the vector store is logged and turned into an
    empty string instead of propagating to the caller.

    Args:
        topic: Topic label used to query ``get_similar_recipes``.
        current_recipe: Name of the recipe the user already asked about; it is
            excluded from the result (case-insensitive comparison).

    Returns:
        The suggested recipe names joined with ``", "``, or ``""`` when there
        is nothing to suggest or the lookup failed.
    """
    max_suggestions = 3
    try:
        # Fetch recipes stored under the same topic.
        vectorstore = VectorStoreService()
        recipes = vectorstore.get_similar_recipes(topic)

        # Seeding ``seen`` with the current recipe (and the empty title)
        # filters both out and also removes duplicate titles, which would
        # otherwise be able to fill every suggestion slot.
        seen = {(current_recipe or '').lower(), ''}
        suggestions = []
        for metadata in recipes['metadatas']:
            # A missing metadata dict or recipe name must not discard the
            # remaining, valid suggestions.
            title = (metadata or {}).get('recipe_name') or ''
            if not isinstance(title, str):
                continue
            key = title.lower()
            if key in seen:
                continue
            seen.add(key)
            suggestions.append(title)
            if len(suggestions) == max_suggestions:
                break

        return ", ".join(suggestions)
    except Exception:
        # Deliberately broad (but not bare, so KeyboardInterrupt/SystemExit
        # still propagate): keep the best-effort contract, but leave a trace.
        logging.getLogger(__name__).warning(
            "Could not fetch recommendations for topic %r", topic, exc_info=True
        )
        return ""


def get_context_and_topic_from_database(
    question: str, k_initial: int = 10, k_final: int = 5
) -> tuple[str, str, str]:
    """Retrieve, rerank and format the context related to ``question``.

    Args:
        question: Query used for the similarity search and for reranking.
        k_initial: Number of documents requested from the vector store.
        k_final: Number of top-reranked documents kept for the context.

    Returns:
        A ``(topic, recipe_name, context_text)`` tuple. ``topic`` and
        ``recipe_name`` come from the metadata of the best-ranked document
        (defaults ``'cocina general'`` and ``''``). When nothing is retrieved
        the tuple is ``('cocina general', '', 'No se encontró contexto.')``.
    """
    default_topic = 'cocina general'
    no_context = "No se encontró contexto."

    vectorstore = VectorStoreService()

    # Keep using 'similarity_search': per the original author's note it is
    # the retrieval call that does not fail.
    results = vectorstore.similarity_search(question, k=k_initial)
    if not results:
        # Nothing to rerank; indexing the best result below would raise.
        return default_topic, '', no_context

    # Build (question, document text) pairs for the reranker.
    pairs = [[question, doc.metadata.get('raw_page_content', doc.page_content)] for doc in results]

    # Relevance scores, one per pair.
    scores = RERANKER_MODEL.predict(pairs)

    # Attach each score to its document and order by descending relevance.
    for doc, score in zip(results, scores):
        doc.metadata['rerank_score'] = score
    ranked = sorted(results, key=lambda doc: doc.metadata['rerank_score'], reverse=True)

    # Keep only the best results.
    final_results = ranked[:k_final]
    if not final_results:
        # e.g. k_final == 0
        return default_topic, '', no_context

    # Topic and recipe name are taken from the top-ranked document.
    best_metadata = final_results[0].metadata
    topic = best_metadata.get('topic_label', default_topic)
    recipe_name = best_metadata.get('recipe_name', '')

    sections = []
    for position, doc in enumerate(final_results, start=1):
        # Prefer 'raw_page_content' so the model reads the larger text block
        # stored in the metadata rather than only the indexed chunk.
        full_content = doc.metadata.get('raw_page_content', doc.page_content)

        sections.append(f"""
        RECETA {position}
        - Página: {doc.metadata.get('page_number', 'Desconocida')}
        - Contenido:
        {full_content}
        ---
        """)
    return topic, recipe_name, "".join(sections)

def _clean_llm_output(text: str) -> str:
    """Strip surrounding whitespace and wrapping quotes from raw LLM output."""
    return text.strip().strip('"').strip("'")


def _normalize_language_code(raw: str) -> str:
    """Map a free-form language-detector reply to ``'es'`` or ``'en'``.

    The reply is split into alphabetic words and the first word that names
    Spanish or English (ISO code or language name) decides the result.
    Matching whole words avoids classifying replies such as
    "written in Spanish" as English just because they contain "en".
    If no known word is found, the stripped, lower-cased reply is returned
    unchanged, so callers treat it as "not Spanish".
    """
    spanish_words = ("es", "spa", "spanish", "español", "espanol")
    english_words = ("en", "eng", "english", "inglés", "ingles")

    cleaned = raw.strip().lower()
    words = "".join(ch if ch.isalpha() else " " for ch in cleaned).split()
    for word in words:
        if word in spanish_words:
            return "es"
        if word in english_words:
            return "en"
    return cleaned


def answer_query(question: str) -> "dict[str, str | None] | str":
    """Answer ``question`` from the recipe book and add a recommendation.

    Pipeline: detect the question language, translate it to Spanish when
    needed, retrieve and rerank context, generate the answer, generate an
    optional recommendation, and translate both back to English when the
    question was not in Spanish.

    Args:
        question: The user's question, in Spanish or English.

    Returns:
        ``{"answer": str, "recommendation": str | None}`` on success.
        If the Ollama client cannot be created, a plain error string
        (``"Error conectando con Ollama: ..."``) is returned instead; this
        pre-existing behaviour is kept for backward compatibility.
    """
    try:
        llm = OllamaLLM(model=LLM_MODEL, temperature=0.0)
    except Exception as e:
        return f"Error conectando con Ollama: {e}"

    print("\n--- CHEF BOT: MODO BIBLIOTECARIO ACTIVO ---")

    # --- STEP 1: language detection ---
    print(f"🕵️ Detectando idioma de: '{question}'...")
    try:
        detector_chain = LANGUAGE_DETECTION_PROMPT | llm | StrOutputParser()
        detected_lang = _normalize_language_code(detector_chain.invoke({"text": question}))
    except Exception:
        # Detection is best-effort: fall back to Spanish, the recipe book's language.
        detected_lang = "es"

    print(f"🌍 Idioma detectado: {detected_lang}")

    needs_translation = detected_lang != 'es'
    # One deterministic (temperature 0.0) translator is shared by both
    # translation steps so they behave consistently.
    translator = TRANSLATION_PROMPT | llm | StrOutputParser()

    # --- STEP 2: translate the question to Spanish if necessary ---
    search_query = question
    if needs_translation:
        print("🔤 Traduciendo pregunta al español...")
        search_query = _clean_llm_output(
            translator.invoke({"target_language": "Spanish", "text": question})
        )
        print(f"🔍 Buscando: '{search_query}'")

    # --- STEP 3: fetch context and topic ---
    topic, recipe_name, context = get_context_and_topic_from_database(search_query)

    # --- STEP 4: generate the main answer ---
    rag_chain = PROMP_TEMPLATE | llm | StrOutputParser()
    answer = _clean_llm_output(rag_chain.invoke({"context": context, "question": search_query}))

    # --- STEP 5: generate a personalised recommendation ---
    suggestions_list = get_recommendations_from_database(topic, recipe_name)
    recommendation = None

    if suggestions_list:
        # Slightly higher temperature than the factual chains, for more natural wording.
        rec_chain = REC_PROMPT | OllamaLLM(model=LLM_MODEL, temperature=0.2) | StrOutputParser()
        recommendation = _clean_llm_output(rec_chain.invoke({
            "topic": topic,
            "suggestions": suggestions_list,
            "language": "español" if detected_lang == 'es' else "inglés"
        }))

    # Final translation back to English if the question was not in Spanish.
    if needs_translation:
        answer = _clean_llm_output(
            translator.invoke({"target_language": "English", "text": answer})
        )
        if recommendation:
            recommendation = _clean_llm_output(
                translator.invoke({"target_language": "English", "text": recommendation})
            )

    return {
        "answer": answer,
        "recommendation": recommendation
    }