Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
75d1d1e
Merge pull request #117 from BibleNLP/develop
woodwardmw Nov 13, 2023
9178c6b
api changes made to Show more information about source documents
Dec 8, 2023
05b56da
Merge branch 'develop' of https://github.com/BibleNLP/assistant.bible…
Dec 8, 2023
14896d7
linting issue resolved
Dec 10, 2023
43cfd05
handle no OPENAI key case in tests
kavitharaju Dec 11, 2023
fc0b6e2
Fix linting issues
kavitharaju Dec 11, 2023
46158a8
Merge pull request #122 from Jayasankar-kk/develop
kavitharaju Dec 11, 2023
e94af8e
Merge branch 'develop' of https://github.com/BibleNLP/assitant.bible …
kavitharaju Dec 11, 2023
aeb9d61
Merge pull request #123 from kavitharaju/test-chat-endpoint
kavitharaju Dec 16, 2023
fd2e90a
Add AQuA docs
woodwardmw Jan 22, 2024
61a4233
Add AQuA docs javascript
woodwardmw Jan 22, 2024
0bd47b6
Replace null tokens and new lines in imported text
woodwardmw Jan 23, 2024
15410bd
Adding in translation
woodwardmw Feb 20, 2024
a3b1ca9
using aws translator instead of gcp
alejandroaquintero Feb 20, 2024
4f0c82e
Add translation in. Get LLM model from env var
woodwardmw Feb 22, 2024
d6ea04b
Fix most linting issues
woodwardmw Feb 22, 2024
68597f8
Change Faith and Farming label
woodwardmw Feb 22, 2024
52c0134
Remove try/except block
woodwardmw Feb 22, 2024
e26f598
Update docstring
woodwardmw Feb 22, 2024
de7c17b
Initial commit
woodwardmw Feb 22, 2024
8951d2d
Merge branch 'develop' into add-translation-in
woodwardmw Feb 22, 2024
9b7189d
Linting
woodwardmw Feb 22, 2024
fd8664a
Merge branch 'develop' of https://github.com/BibleNLP/assitant.bible …
woodwardmw Feb 22, 2024
e193528
Merge branch 'develop' into add-translation-in
woodwardmw Feb 22, 2024
2e08c8f
Trigger CI build
alejandroaquintero Feb 23, 2024
078eaae
only in push develop and main
alejandroaquintero Feb 23, 2024
8c16587
add region
alejandroaquintero Feb 23, 2024
46f071e
mocking translate
alejandroaquintero Feb 23, 2024
d22210b
mocking translate from router
alejandroaquintero Feb 23, 2024
4e92361
trying to add the mock before client creation
alejandroaquintero Feb 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions .github/workflows/check_on_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@

name: linting

# Triggers the workflow on push or pull request
on: [push, pull_request ]
# Triggers the workflow on push or pull request for branches develop and main
on:
push:
branches:
- develop
- main
pull_request:
branches:
- develop
- main

# Jobs can run in parallel
jobs:
Expand Down
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"app"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
4 changes: 2 additions & 2 deletions app/core/llm_framework/openai_langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ class LangchainOpenAI(LLMFrameworkInterface):
def __init__(
self, # pylint: disable=super-init-not-called
# FIXME : Ideal to be able to mock the __init__ from tests
key: str = os.getenv("OPENAI_API_KEY"),
model_name: str = "gpt-3.5-turbo",
key: str = os.getenv("OPENAI_API_KEY", "dummy-for-test"),
model_name: str = os.getenv("OPENAI_LLM_NAME", "gpt-3.5-turbo"),
vectordb: VectordbInterface = Chroma(),
max_tokens_limit: int = int(
os.getenv("OPENAI_MAX_TOKEN_LIMIT", "3052")),
Expand Down
27 changes: 15 additions & 12 deletions app/core/llm_framework/openai_vanilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,18 @@ def get_context(source_documents):
return context


def get_pre_prompt(context):
def get_pre_prompt(context, response_language="English"):
"""Constructs a pre-prompt for the conversation, including the context"""
chat_prefix = "The following is a conversation with an AI assistant for "
chat_prefix += "Bible translators. The assistant is"
chat_prefix += "verbose, helpful, creative, clever, very friendly and follows instructions carefully,"
chat_prefix += "verbose, helpful, creative, clever, very friendly "
chat_prefix += "and follows instructions carefully,"
chat_prefix += "giving as much information as possible.\n"
prompt = (
chat_prefix
+ "Read the paragraph below and answer the question, using only the information"
" in the context delimited by triple backticks. "
f" Your response should be in the {response_language} language."
"If the question cannot be answered based on the context alone, "
'write "Sorry, I had trouble answering this question based on the '
"information I found\n"
Expand Down Expand Up @@ -78,7 +80,7 @@ class OpenAIVanilla(LLMFrameworkInterface): # pylint: disable=too-few-public-me
def __init__(
self, # pylint: disable=super-init-not-called
key: str = os.getenv("OPENAI_API_KEY"),
model_name: str = "gpt-3.5-turbo-1106",
model_name: str = os.getenv("OPENAI_LLM_NAME", "gpt-3.5-turbo"),
vectordb: VectordbInterface = None, # What should this be by default?
) -> None:
"""Sets the API key and initializes library objects if any"""
Expand All @@ -93,22 +95,24 @@ def __init__(
self.vectordb = vectordb

def generate_text(
self, query: str, chat_history: List[Tuple[str, str]], **kwargs
self,
query: str,
chat_history: List[Tuple[str, str]],
response_language: str = "English",
**kwargs,
) -> dict:
"""Prompt completion for QA or Chat reponse, based on specific documents,
if provided"""
if len(kwargs) > 0:
log.warning(
"Unused arguments in VanillaOpenAI.generate_text(): ", **kwargs)
log.warning("Unused arguments in VanillaOpenAI.generate_text(): ", **kwargs)

# Vectordb results are currently returned based on the whole chat history.
# We'll need to figure out if this is optimal or not.
query_text = "\n".join(
[x[0] + "/n" + x[1][:50] + "\n" for x in chat_history])
query_text = "\n".join([x[0] + "/n" + x[1][:50] + "\n" for x in chat_history])
query_text += "\n" + query
source_documents = self.vectordb._get_relevant_documents(query_text) #pylint: disable=protected-access
source_documents = self.vectordb._get_relevant_documents(query_text) # pylint: disable=protected-access
context = get_context(source_documents)
pre_prompt = get_pre_prompt(context)
pre_prompt = get_pre_prompt(context, response_language=response_language)
prompt = append_query_to_prompt(pre_prompt, query, chat_history)
print(f"{prompt=}")

Expand All @@ -125,5 +129,4 @@ def generate_text(
}

except Exception as exe:
raise OpenAIException(
"While generating answer: " + str(exe)) from exe
raise OpenAIException("While generating answer: " + str(exe)) from exe
42 changes: 42 additions & 0 deletions app/core/translation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Translating the user's input to English using AWS Translate service"""

import json

import boto3

# Load the language lookup table that translate_text() consults to label the
# detected source language (presumably ISO 639-1 code -> language name; verify
# against the JSON file).
# NOTE(review): the path is relative to the current working directory, not to
# this file -- confirm the process is always started from the expected folder.
with open("../iso639-1.json", "r", encoding="utf-8") as f:
    iso_639_1 = json.load(f)


def translate_text(text: str):
    """
    Translate ``text`` into English with the AWS Translate service.

    The source language is detected automatically by the service. The part of
    the reported source language code before the first "-" is looked up in the
    module-level ``iso_639_1`` mapping and the outcome is stored on the
    response under the ``"language"`` key.

    :param text: The text to translate
    :return: The AWS Translate response, with an added ``"language"`` entry
        ("Unknown language" when the code is not present in the mapping)
    """
    # A new client is created on every call, pinned to the us-east-1 region.
    client = boto3.client(
        service_name="translate", region_name="us-east-1", use_ssl=True
    )

    request = {
        "Text": text,
        "SourceLanguageCode": "auto",  # let the service detect the language
        "TargetLanguageCode": "en",  # always translate into English
    }
    response = client.translate_text(**request)

    # Keep only the leading subtag of the code (e.g. "zh" from "zh-TW").
    detected_code = response.get("SourceLanguageCode", "")
    primary_subtag, _, _ = detected_code.partition("-")
    response["language"] = iso_639_1.get(primary_subtag, "Unknown language")

    return response


if __name__ == "__main__":
    # Manual smoke test: translate a sample phrase and print the full response.
    TEST_TEXT = "Hola, mundo"
    print(translate_text(TEST_TEXT))
89 changes: 48 additions & 41 deletions app/core/vectordb/postgres4langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
MAX_COSINE_DISTANCE = os.getenv("POSTGRES_MAX_COSINE_DISTANCE", "0.1")


class Postgres(
VectordbInterface, BaseRetriever
): # pylint: disable=too-many-instance-attributes
class Postgres(VectordbInterface, BaseRetriever): # pylint: disable=too-many-instance-attributes
"""Interface for vector database technology, its connection, configs and operations"""

db_host: str = os.environ.get("POSTGRES_DB_HOST", "localhost")
Expand All @@ -35,18 +33,19 @@ class Postgres(
db_password: str = os.environ.get("POSTGRES_DB_PASSWORD", "secret")
embedding: EmbeddingInterface = None
db_client: Any = None
db_conn:Any = None
labels:List[str] = []
query_limit:int = None
db_conn: Any = None
labels: List[str] = []
query_limit: int = None
max_cosine_distance: str = None

def __init__(
self,
embedding: EmbeddingInterface = None,
host:str=None,
port:int=None,
path:str=None,
collection_name:str=None,
**kwargs:str,
host: str = None,
port: int = None,
path: str = None,
collection_name: str = None,
**kwargs: str,
) -> None:
"""Instantiate a chroma client"""
VectordbInterface.__init__(self, host, port, path, collection_name)
Expand Down Expand Up @@ -129,25 +128,29 @@ def __init__(
cur.close()
self.db_conn.commit()
except Exception as exe:
raise PostgresException(
"While initializing client: " + str(exe)) from exe
raise PostgresException("While initializing client: " + str(exe)) from exe

def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
"""Loads the document object as per chroma DB formats into the collection"""
data_list = []
for doc in docs:
doc.text = (doc.text
.replace("\n", " ")
.replace("\r", " ")
.replace("\t", " ")
.replace('\x00', '')
doc.text = (
doc.text.replace("\n", " ")
.replace("\r", " ")
.replace("\t", " ")
.replace("\x00", "")
)
cur = self.db_conn.cursor()
cur.execute(
"SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
cur.execute("SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
doc_id_already_exists = cur.fetchone()
links= ",".join([str(item) for item in doc.links])
doc.text = doc.text.replace('\0', '').replace('\x00', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
links = ",".join([str(item) for item in doc.links])
doc.text = (
doc.text.replace("\0", "")
.replace("\x00", "")
.replace("\n", " ")
.replace("\r", " ")
.replace("\t", " ")
)
if not doc_id_already_exists:
data_list.append(
[
Expand Down Expand Up @@ -207,16 +210,18 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
cur.close()

def _get_relevant_documents(
self, query: list, run_manager: CallbackManagerForRetrieverRun| None = None, **kwargs
self,
query: list,
run_manager: CallbackManagerForRetrieverRun | None = None,
**kwargs,
) -> List[LangchainDocument]:
"""Similarity search on the vector store"""
query_doc = schema.Document(docId="xxx", text=query)
try:
self.embedding.get_embeddings(doc_list=[query_doc])
query_vector = query_doc.embedding
except Exception as exe:
raise GenericException(
"While vectorising the query: " + str(exe)) from exe
raise GenericException("While vectorising the query: " + str(exe)) from exe
try:
cur = self.db_conn.cursor()
cur.execute(
Expand Down Expand Up @@ -251,31 +256,36 @@ def _get_relevant_documents(
"This question can't be answered, but the user could try "
"rewording or asking something else."
),
metadata={
"source": "no records found"
},
metadata={"source": "no records found"},
)
]
return [
LangchainDocument(page_content=doc[1], metadata={"label": doc[0],
"media": doc[1],
'link':doc[2],
'source_id':doc[3],
'document':doc[4]})
LangchainDocument(
page_content=doc[1],
metadata={
"label": doc[0],
"media": doc[1],
"link": doc[2],
"source_id": doc[3],
"document": doc[4],
},
)
for doc in records
]

async def _aget_relevant_documents(
self, query: list, run_manager: AsyncCallbackManagerForRetrieverRun| None = None, **kwargs
self,
query: list,
run_manager: AsyncCallbackManagerForRetrieverRun | None = None,
**kwargs,
) -> List[LangchainDocument]:
"""Similarity search on the vector store"""
query_doc = schema.Document(docId="xxx", text=query)
try:
self.embedding.get_embeddings(doc_list=[query_doc])
query_vector = query_doc.embedding
except Exception as exe:
raise GenericException(
"While vectorising the query: " + str(exe)) from exe
raise GenericException("While vectorising the query: " + str(exe)) from exe
try:
cur = self.db_conn.cursor()
cur.execute(
Expand Down Expand Up @@ -311,9 +321,7 @@ async def _aget_relevant_documents(
"This question can't be answered, but the user could try "
"rewording or asking something else."
),
metadata={
"source": "no records found"
},
metadata={"source": "no records found"},
)
]
return [
Expand All @@ -330,7 +338,6 @@ def get_available_labels(self) -> List[str]:
records = cur.fetchall()
cur.close()
except Exception as exe:
raise PostgresException(
"While querying for labels: " + str(exe)) from exe
raise PostgresException("While querying for labels: " + str(exe)) from exe
labels = [row[0] for row in records]
return labels
Loading