# AskMyPaper/main.py at main · usk2003/AskMyPaper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import os
import pickle
import streamlit as st
from PyPDF2 import PdfReader
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import google.generativeai as genai
import hashlib
import re
from langchain.schema import Document
from dotenv import load_dotenv
import requests
from io import BytesIO

# Run load_dotenv() first so that a GOOGLE_API_KEY it adds to the environment
# (presumably from a local .env file) is visible to the lookup below. The key
# is passed as None when the variable is unset.
load_dotenv()
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

def parse_pdf(pdf_file_object):
    """Return the concatenated text of every page in a PDF.

    ``pdf_file_object`` is handed straight to ``PdfReader``. Each page whose
    ``extract_text()`` result is non-empty contributes that text followed by
    a newline; pages that yield nothing are skipped.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_file_object).pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

def get_answer_from_gemini(context, question):
    """
    Ask Gemini to answer ``question`` using only the supplied ``context``.

    The prompt instructs the model to stay within the context (factual
    extraction / synthesis from research papers) and to say so when the
    context is insufficient.

    Args:
        context: Text snippets retrieved from the papers, already joined
            into a single string.
        question: The user's question.

    Returns:
        The model's answer with surrounding whitespace stripped, or a short
        explanatory message when the model returned no text.
    """
    model = genai.GenerativeModel("gemini-2.5-flash-preview-04-17")
    prompt = f"""You are a highly analytical research assistant. Your task is to extract or synthesize information directly from the provided 'Context' to answer the user's 'Question'.

    Focus strictly on the factual content within the context. Do not invent information, speculate, or bring in outside knowledge. If the context does not contain enough information to fully answer the question, state that clearly and concisely.

    If the question asks for a comparison or details across multiple sources, synthesize the information from the relevant parts of the context.

    Context:
    {context}

    Question:
    {question}

    Answer:"""
    response = model.generate_content(prompt)
    try:
        answer_text = response.text
    except ValueError:
        # The ``text`` quick accessor raises ValueError when the response
        # carries no text part (e.g. the candidate was blocked or empty).
        # Report that to the user instead of crashing the page.
        return (
            "The model did not return a text answer (the response may have "
            "been blocked or empty). Please try rephrasing your question."
        )
    return answer_text.strip()

# A DOI suffix may itself contain dots (e.g. 10.1016/j.cell.2020.01.001), so
# the suffix is matched greedily and trailing punctuation is trimmed afterwards.
_DOI_RE = re.compile(r"\b10\.\d{4,9}/[^\s\"'<>]+")
# "arXiv:1706.03762", optionally with whitespace after the colon and a version
# suffix such as "v5"; only the bare identifier is captured.
_ARXIV_TAG_RE = re.compile(r"\barxiv:\s*(\d{4}\.\d{4,5})(?:v\d+)?\b", re.IGNORECASE)
# arXiv's own DOI form, e.g. 10.48550/arXiv.1706.03762.
_ARXIV_DOI_RE = re.compile(r"10\.48550/arXiv\.(\d{4}\.\d{4,5})(?:v\d+)?\b", re.IGNORECASE)
_BRACKET_PAIRS = {")": "(", "]": "[", "}": "{"}


def _trim_doi(raw_doi):
    """Strip sentence punctuation and unbalanced closing brackets from the end of a DOI match."""
    doi = raw_doi
    while doi:
        last = doi[-1]
        opener = _BRACKET_PAIRS.get(last)
        if last in ".,;:":
            doi = doi[:-1]
        elif opener is not None and doi.count(last) > doi.count(opener):
            # A closer with no matching opener belongs to the surrounding
            # prose, e.g. "(see 10.1000/182)".
            doi = doi[:-1]
        else:
            break
    return doi


def extract_references(text):
    """Find DOI and arXiv references in ``text``.

    Args:
        text: Free text, typically the extracted text of a paper.

    Returns:
        A list of unique reference strings: DOIs as written in the text
        (``"10.xxxx/suffix"``) followed by arXiv identifiers normalised to
        ``"arxiv:<id>"``. Each group is sorted so the result is deterministic.
        arXiv DOIs (``10.48550/arXiv.<id>``) are reported once, as the arXiv
        identifier, rather than additionally as a DOI.
    """
    arxiv_ids = set(_ARXIV_TAG_RE.findall(text)) | set(_ARXIV_DOI_RE.findall(text))

    dois = set()
    for raw_doi in _DOI_RE.findall(text):
        doi = _trim_doi(raw_doi)
        if doi.endswith("/"):
            # Nothing but punctuation followed the registrant prefix.
            continue
        if _ARXIV_DOI_RE.search(doi):
            # Already captured above as an arXiv identifier.
            continue
        dois.add(doi)

    return sorted(dois) + sorted(f"arxiv:{arxiv_id}" for arxiv_id in arxiv_ids)

def download_pdf_from_reference(ref, timeout=15):
    """Try to download the PDF behind an arXiv identifier or a DOI.

    This is a best-effort helper: any network failure or non-PDF response
    yields ``None`` rather than an exception.

    Args:
        ref: A reference string. Anything containing "arxiv" (any case) is
            treated as an arXiv reference and must contain an identifier of
            the form ``NNNN.NNNNN``; anything starting with ``"10."`` is
            treated as a DOI and resolved through doi.org.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot stall the app indefinitely.

    Returns:
        A ``BytesIO`` holding the PDF bytes, or ``None`` when the reference
        is not recognised, the request fails, or the response is not a PDF.
    """
    try:
        if "arxiv" in ref.lower():
            match = re.search(r"(\d{4}\.\d{4,5})", ref)
            if not match:
                return None
            url = f"https://arxiv.org/pdf/{match.group(1)}.pdf"
            response = requests.get(url, timeout=timeout)
        elif ref.startswith("10."):
            # Ask the DOI resolver for a PDF directly; many publishers answer
            # with an HTML landing page instead, which is rejected below.
            response = requests.get(
                f"https://doi.org/{ref}",
                headers={"Accept": "application/pdf"},
                allow_redirects=True,
                timeout=timeout,
            )
        else:
            return None
    except requests.RequestException:
        # Connection errors, timeouts, too many redirects, ...: the citation
        # is simply skipped.
        return None

    content_type = response.headers.get("Content-Type", "").lower()
    if response.status_code == 200 and "application/pdf" in content_type:
        return BytesIO(response.content)
    return None


def _compute_store_name(uploaded_pdfs):
    """Return the cache key for the current set of uploaded PDFs.

    The key covers each file's name and content, so uploading a different
    PDF under a previously used file name does not reuse a stale index.
    """
    processing_logic_version = "v1.1"
    fingerprints = sorted(
        f"{pdf.name}:{hashlib.md5(pdf.getvalue()).hexdigest()}" for pdf in uploaded_pdfs
    )
    store_hash_input = "|".join(fingerprints) + processing_logic_version
    store_hash = hashlib.md5(store_hash_input.encode()).hexdigest()
    return f"research_vs_{store_hash}"


def _split_into_documents(text_splitter, text, source):
    """Split ``text`` into chunks and wrap each one in a Document tagged with ``source``."""
    return [
        Document(page_content=chunk, metadata={"source": source})
        for chunk in text_splitter.split_text(text)
    ]


def _build_knowledge_base(uploaded_pdfs, store_name):
    """Parse uploads and their cited papers, then load or build the vector store.

    Downloaded citations are recorded in ``st.session_state.downloaded_citations``.
    Returns the vector store, or ``None`` when there is no cached index and
    no text could be extracted.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    documents = []

    # 1. Uploaded papers: chunk their text and fetch the papers they cite.
    for uploaded_pdf in uploaded_pdfs:
        uploaded_pdf.seek(0)
        text = parse_pdf(uploaded_pdf)
        documents.extend(_split_into_documents(text_splitter, text, uploaded_pdf.name))

        for ref in extract_references(text):
            if ref not in st.session_state.downloaded_citations:
                pdf_file_obj = download_pdf_from_reference(ref)
                if pdf_file_obj:
                    st.session_state.downloaded_citations[ref] = pdf_file_obj

    # 2. Cited papers that could be downloaded.
    if st.session_state.downloaded_citations:
        st.markdown("<small>Processing downloaded citations...</small>", unsafe_allow_html=True)
        for ref, pdf_obj in st.session_state.downloaded_citations.items():
            pdf_obj.seek(0)
            documents.extend(_split_into_documents(text_splitter, parse_pdf(pdf_obj), ref))

    pickle_path = f"{store_name}.pkl"
    if os.path.exists(pickle_path):
        # SECURITY: pickle.load can execute arbitrary code. This is only safe
        # while the working directory (and its research_vs_*.pkl files) is
        # writable by trusted users only.
        with open(pickle_path, "rb") as f:
            return pickle.load(f)

    if not documents:
        return None

    # Instantiate the embedding model only when an index actually has to be built.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(documents, embedding=embeddings)
    with open(pickle_path, "wb") as f:
        pickle.dump(vectorstore, f)
    return vectorstore


def _render_answer(vectorstore, query):
    """Retrieve the closest chunks for ``query`` and show the answer plus its supporting snippets."""
    retrieved_results = vectorstore.similarity_search_with_score(query, k=8)

    context_for_llm = "\n\n".join(doc.page_content for doc, _score in retrieved_results)

    if context_for_llm:
        answer = get_answer_from_gemini(context_for_llm, query)
    else:
        answer = "I'm sorry, I couldn't find any relevant information in the uploaded papers to answer your question. The context was empty."

    st.markdown("### ✨ AI-Generated Answer")
    st.write(answer)

    st.markdown("---")
    st.subheader("🔎 Supporting Snippets from Documents")
    if retrieved_results:
        for i, (doc, score) in enumerate(retrieved_results, start=1):
            # NOTE(review): with FAISS's default settings this score is
            # presumably a distance (lower = closer), not a relevance;
            # confirm before relying on the label below.
            with st.expander(f"Snippet {i} from: **{doc.metadata.get('source', 'Unknown')}** (Relevance: {score:.2f})"):
                st.code(doc.page_content, language="text")
    else:
        st.info("No relevant snippets found for your question.")


def _render_citation_downloads():
    """Offer every downloaded citation PDF as a download button, if there are any."""
    downloaded_citations = st.session_state.get("downloaded_citations")
    if not downloaded_citations:
        return
    with st.expander("📂 Downloaded Citation Papers (for reference)"):
        for ref, file_obj in downloaded_citations.items():
            st.markdown(f"*Reference:* {ref}")
            st.download_button(label=f"Download {ref}",
                               data=file_obj.getvalue(),
                               file_name=f"{ref.replace('/', '_').replace(':', '-')}.pdf",
                               mime="application/pdf")


def main():
    """Render the AskMyPaper page.

    Flow: the user uploads PDFs, the text of those PDFs and of any cited
    papers that can be downloaded is chunked and indexed in a FAISS vector
    store, and the closest chunks for a question are sent to Gemini to
    produce an answer. The vector store is kept in ``st.session_state`` and
    pickled to ``research_vs_<hash>.pkl`` so later runs with the same
    uploads can reuse it.
    """
    st.set_page_config(layout="centered", page_title="AskMyPaper Q&A")
    with st.sidebar:
        st.title('🔬 AskMyPaper')
        st.markdown('''
        Upload your research papers to quickly find answers, extract key details,
        and compare information across your document collection.
        ''')
        add_vertical_space(2)
        st.write("Powered by LangChain, FAISS, and Google Gemini Pro")
        add_vertical_space(1)
        st.info("💡 *Tip:* Upload multiple papers for comprehensive cross-document insights!")

    st.title("📚 Research Paper Q&A System")
    st.markdown("Upload your PDFs and ask specific questions to get direct answers from your research papers.")

    uploaded_pdfs = st.file_uploader(
        "Upload your research paper PDFs here",
        type='pdf',
        accept_multiple_files=True,
        help="You can upload one or multiple PDF research papers."
    )

    if uploaded_pdfs:
        store_name = _compute_store_name(uploaded_pdfs)

        if "vectorstore" not in st.session_state or st.session_state.get("store_name") != store_name:
            # New or changed upload set: start from a clean citation list.
            st.session_state.downloaded_citations = {}

            with st.spinner("Processing papers and building knowledge base... This might take a moment if downloading citations..."):
                vectorstore = _build_knowledge_base(uploaded_pdfs, store_name)

            if vectorstore is None:
                st.warning("No text extracted from PDFs to build knowledge base.")
                return

            st.session_state.vectorstore = vectorstore
            st.session_state.store_name = store_name
            st.info(f"✅ Ready! Knowledge base built from *{len(uploaded_pdfs)}* uploaded papers and *{len(st.session_state.downloaded_citations)}* cited papers.")
        else:
            vectorstore = st.session_state.vectorstore
            st.info(f"✅ Knowledge base for *{len(uploaded_pdfs)}* uploaded papers and *{len(st.session_state.downloaded_citations)}* cited papers loaded.")

        st.markdown("---")

        st.subheader("❓ Ask Your Research Question")
        query = st.text_area(
            "Type your question here (e.g., 'What are the main findings of the paper?', 'How is X measured?', 'Compare methodology A and B')",
            height=80,
            placeholder="Enter your question about the uploaded papers..."
        )

        query_button = st.button("Get Answer", type="primary", use_container_width=True)

        if query_button and query:
            if vectorstore is not None:
                with st.spinner("Searching papers and synthesizing answer..."):
                    _render_answer(vectorstore, query)
            else:
                st.warning("Please upload PDFs first to build the knowledge base.")
        elif query_button:
            st.warning("Please enter a question to get an answer.")
    else:
        st.info("Upload research papers above to start analyzing their content.")

    st.markdown("---")

    _render_citation_downloads()

    st.markdown("---")
    st.caption("This system aims to provide answers directly from your uploaded research papers and their extracted citations. It does not provide real-time information or engage in general conversation outside the document context.")

# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()