BibleNLP · alejandroaquintero · Nov 13, 2023 · Dec 8, 2023 · Dec 8, 2023 · Dec 10, 2023
diff --git a/.github/workflows/check_on_push.yml b/.github/workflows/check_on_push.yml
@@ -2,8 +2,16 @@
 
 name: linting
 
-# Triggers the workflow on push or pull request
-on: [push, pull_request ]
+# Triggers the workflow on push or pull request for branches develop and main
+on:
+  push:
+    branches:
+      - develop
+      - main
+  pull_request:
+    branches:
+      - develop
+      - main
 
 # Jobs can run in parallel
 jobs:

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "app"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
diff --git a/app/core/llm_framework/openai_langchain.py b/app/core/llm_framework/openai_langchain.py
@@ -33,8 +33,8 @@ class LangchainOpenAI(LLMFrameworkInterface):
     def __init__(
         self,  # pylint: disable=super-init-not-called
         # FIXME : Ideal to be able to mock the __init__ from tests
-        key: str = os.getenv("OPENAI_API_KEY"),
-        model_name: str = "gpt-3.5-turbo",
+        key: str = os.getenv("OPENAI_API_KEY", "dummy-for-test"),
+        model_name: str = os.getenv("OPENAI_LLM_NAME", "gpt-3.5-turbo"),
         vectordb: VectordbInterface = Chroma(),
         max_tokens_limit: int = int(
             os.getenv("OPENAI_MAX_TOKEN_LIMIT", "3052")),

diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py
@@ -32,16 +32,18 @@ def get_context(source_documents):
     return context
 
 
-def get_pre_prompt(context):
+def get_pre_prompt(context, response_language="English"):
     """Constructs a pre-prompt for the conversation, including the context"""
     chat_prefix = "The following is a conversation with an AI assistant for "
     chat_prefix += "Bible translators. The assistant is"
-    chat_prefix += "verbose, helpful, creative, clever, very friendly and follows instructions carefully,"
+    chat_prefix += " verbose, helpful, creative, clever, very friendly "
+    chat_prefix += "and follows instructions carefully, "
     chat_prefix += "giving as much information as possible.\n"
     prompt = (
         chat_prefix
         + "Read the paragraph below and answer the question, using only the information"
         " in the context delimited by triple backticks. "
+        f"Your response should be in the {response_language} language. "
         "If the question cannot be answered based on the context alone, "
         'write "Sorry, I had trouble answering this question based on the '
         "information I found\n"
@@ -78,7 +80,7 @@ class OpenAIVanilla(LLMFrameworkInterface):  # pylint: disable=too-few-public-me
     def __init__(
         self,  # pylint: disable=super-init-not-called
         key: str = os.getenv("OPENAI_API_KEY"),
-        model_name: str = "gpt-3.5-turbo-1106",
+        model_name: str = os.getenv("OPENAI_LLM_NAME", "gpt-3.5-turbo"),
         vectordb: VectordbInterface = None,  # What should this be by default?
     ) -> None:
         """Sets the API key and initializes library objects if any"""
@@ -93,22 +95,24 @@ def __init__(
         self.vectordb = vectordb
 
     def generate_text(
-        self, query: str, chat_history: List[Tuple[str, str]], **kwargs
+        self,
+        query: str,
+        chat_history: List[Tuple[str, str]],
+        response_language: str = "English",
+        **kwargs,
     ) -> dict:
         """Prompt completion for QA or Chat reponse, based on specific documents,
         if provided"""
         if len(kwargs) > 0:
-            log.warning(
-                "Unused arguments in VanillaOpenAI.generate_text(): ", **kwargs)
+            log.warning("Unused arguments in VanillaOpenAI.generate_text(): %s", kwargs)
 
         # Vectordb results are currently returned based on the whole chat history.
         # We'll need to figure out if this is optimal or not.
-        query_text = "\n".join(
-            [x[0] + "/n" + x[1][:50] + "\n" for x in chat_history])
+        query_text = "\n".join([x[0] + "\n" + x[1][:50] + "\n" for x in chat_history])
         query_text += "\n" + query
-        source_documents = self.vectordb._get_relevant_documents(query_text) #pylint: disable=protected-access
+        source_documents = self.vectordb._get_relevant_documents(query_text)  # pylint: disable=protected-access
         context = get_context(source_documents)
-        pre_prompt = get_pre_prompt(context)
+        pre_prompt = get_pre_prompt(context, response_language=response_language)
         prompt = append_query_to_prompt(pre_prompt, query, chat_history)
         print(f"{prompt=}")
 
@@ -125,5 +129,4 @@ def generate_text(
             }
 
         except Exception as exe:
-            raise OpenAIException(
-                "While generating answer: " + str(exe)) from exe
+            raise OpenAIException("While generating answer: " + str(exe)) from exe
diff --git a/app/core/translation/__init__.py b/app/core/translation/__init__.py
@@ -0,0 +1,42 @@
+"""Translating the user's input to English using AWS Translate service"""
+
+import json
+
+import boto3
+
+with open("../iso639-1.json", "r", encoding="utf-8") as f:  # NOTE(review): relative to CWD
+    iso_639_1 = json.load(f)  # looked up by source-language code in translate_text()
+
+
+def translate_text(text: str):
+    """
+    Translate the given text to English with AWS Translate, auto-detecting the source
+    :param text: The text to translate
+    :return: The AWS Translate response with an extra "language" entry from iso_639_1
+    """
+    # A new client is created on every call; the region is fixed to us-east-1
+    translate = boto3.client(
+        service_name="translate", region_name="us-east-1", use_ssl=True
+    )
+    # Call AWS Translate
+    response = translate.translate_text(
+        Text=text,
+        SourceLanguageCode="auto",  # Automatically detect the source language
+        TargetLanguageCode="en",  # Target language code is 'en' for English
+    )
+
+    source_language_code = response.get("SourceLanguageCode", "").split("-")[
+        0
+    ]  # Keep only the part before the first "-" so it can be looked up in iso_639_1
+    response["language"] = iso_639_1.get(source_language_code, "Unknown language")
+
+    return response
+
+
+if __name__ == "__main__":
+    # Manual check when the module is run as a script: a short non-English sample
+    TEST_TEXT = "Hola, mundo"
+    # translate_text() sends the sample to AWS Translate
+    result = translate_text(TEST_TEXT)
+    # Show the whole response, including the added "language" entry
+    print(result)
diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py
@@ -22,9 +22,7 @@
 MAX_COSINE_DISTANCE = os.getenv("POSTGRES_MAX_COSINE_DISTANCE", "0.1")
 
 
-class Postgres(
-    VectordbInterface, BaseRetriever
-):  # pylint: disable=too-many-instance-attributes
+class Postgres(VectordbInterface, BaseRetriever):  # pylint: disable=too-many-instance-attributes
     """Interface for vector database technology, its connection, configs and operations"""
 
     db_host: str = os.environ.get("POSTGRES_DB_HOST", "localhost")
@@ -35,18 +33,19 @@ class Postgres(
     db_password: str = os.environ.get("POSTGRES_DB_PASSWORD", "secret")
     embedding: EmbeddingInterface = None
     db_client: Any = None
-    db_conn:Any = None
-    labels:List[str] = []
-    query_limit:int = None
+    db_conn: Any = None
+    labels: List[str] = []
+    query_limit: int = None
     max_cosine_distance: str = None
+
     def __init__(
         self,
         embedding: EmbeddingInterface = None,
-        host:str=None,
-        port:int=None,
-        path:str=None,
-        collection_name:str=None,
-        **kwargs:str,
+        host: str = None,
+        port: int = None,
+        path: str = None,
+        collection_name: str = None,
+        **kwargs: str,
     ) -> None:
         """Instantiate a chroma client"""
         VectordbInterface.__init__(self, host, port, path, collection_name)
@@ -129,25 +128,29 @@ def __init__(
             cur.close()
             self.db_conn.commit()
         except Exception as exe:
-            raise PostgresException(
-                "While initializing client: " + str(exe)) from exe
+            raise PostgresException("While initializing client: " + str(exe)) from exe
 
     def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
         """Loads the document object as per chroma DB formats into the collection"""
         data_list = []
         for doc in docs:
-            doc.text = (doc.text
-                        .replace("\n", " ")
-                        .replace("\r", " ")
-                        .replace("\t", " ")
-                        .replace('\x00', '')
+            doc.text = (
+                doc.text.replace("\n", " ")
+                .replace("\r", " ")
+                .replace("\t", " ")
+                .replace("\x00", "")
             )
             cur = self.db_conn.cursor()
-            cur.execute(
-                "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
+            cur.execute("SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
             doc_id_already_exists = cur.fetchone()
-            links= ",".join([str(item) for item in doc.links])
-            doc.text = doc.text.replace('\0', '').replace('\x00', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
+            links = ",".join([str(item) for item in doc.links])
+            doc.text = (
+                doc.text.replace("\0", "")
+                .replace("\x00", "")
+                .replace("\n", " ")
+                .replace("\r", " ")
+                .replace("\t", " ")
+            )
             if not doc_id_already_exists:
                 data_list.append(
                     [
@@ -207,16 +210,18 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
         cur.close()
 
     def _get_relevant_documents(
-        self, query: list, run_manager: CallbackManagerForRetrieverRun| None = None, **kwargs
+        self,
+        query: str,
+        run_manager: CallbackManagerForRetrieverRun | None = None,
+        **kwargs,
     ) -> List[LangchainDocument]:
         """Similarity search on the vector store"""
         query_doc = schema.Document(docId="xxx", text=query)
         try:
             self.embedding.get_embeddings(doc_list=[query_doc])
             query_vector = query_doc.embedding
         except Exception as exe:
-            raise GenericException(
-                "While vectorising the query: " + str(exe)) from exe
+            raise GenericException("While vectorising the query: " + str(exe)) from exe
         try:
             cur = self.db_conn.cursor()
             cur.execute(
@@ -251,31 +256,36 @@ def _get_relevant_documents(
                         "This question can't be answered, but the user could try "
                         "rewording or asking something else."
                     ),
-                    metadata={
-                        "source": "no records found"
-                    },
+                    metadata={"source": "no records found"},
                 )
             ]
         return [
-            LangchainDocument(page_content=doc[1], metadata={"label": doc[0],
-                                                             "media": doc[1],
-                                                             'link':doc[2],
-                                                            'source_id':doc[3],
-                                                            'document':doc[4]})
+            LangchainDocument(
+                page_content=doc[1],
+                metadata={
+                    "label": doc[0],
+                    "media": doc[1],
+                    "link": doc[2],
+                    "source_id": doc[3],
+                    "document": doc[4],
+                },
+            )
             for doc in records
         ]
 
     async def _aget_relevant_documents(
-        self, query: list, run_manager: AsyncCallbackManagerForRetrieverRun| None = None, **kwargs
+        self,
+        query: str,
+        run_manager: AsyncCallbackManagerForRetrieverRun | None = None,
+        **kwargs,
     ) -> List[LangchainDocument]:
         """Similarity search on the vector store"""
         query_doc = schema.Document(docId="xxx", text=query)
         try:
             self.embedding.get_embeddings(doc_list=[query_doc])
             query_vector = query_doc.embedding
         except Exception as exe:
-            raise GenericException(
-                "While vectorising the query: " + str(exe)) from exe
+            raise GenericException("While vectorising the query: " + str(exe)) from exe
         try:
             cur = self.db_conn.cursor()
             cur.execute(
@@ -311,9 +321,7 @@ async def _aget_relevant_documents(
                         "This question can't be answered, but the user could try "
                         "rewording or asking something else."
                     ),
-                    metadata={
-                        "source": "no records found"
-                    },
+                    metadata={"source": "no records found"},
                 )
             ]
         return [
@@ -330,7 +338,6 @@ def get_available_labels(self) -> List[str]:
             records = cur.fetchall()
             cur.close()
         except Exception as exe:
-            raise PostgresException(
-                "While querying for labels: " + str(exe)) from exe
+            raise PostgresException("While querying for labels: " + str(exe)) from exe
         labels = [row[0] for row in records]
         return labels