import streamlit as st
import os
import asyncio
import nest_asyncio
import sys
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from langchain.chains import RetrievalQA
from streamlit_chat import message as st_message
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
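
# Rough dependency set for the imports above (package names inferred from the
# import paths; exact versions are assumptions, pin as needed):
#   pip install streamlit streamlit-chat python-dotenv nest-asyncio langchain \
#       langchain-community langchain-openai langchain-ollama faiss-cpu \
#       "unstructured[md]" crawl4ai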

# Set Windows event loop policy
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

load_dotenv()
nest_asyncio.apply()
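
# load_dotenv() reads a .env file from the working directory; it is expected to
# provide the OpenAI key used below for embeddings and chat, e.g.:
#   OPENAI_API_KEY=sk-...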

# ---------------------------
# Initialize Session State Variables
# ---------------------------
if "url_submitted" not in st.session_state:
    st.session_state.url_submitted = False
if "extraction_done" not in st.session_state:
    st.session_state.extraction_done = False
if "extracted_text" not in st.session_state:
    st.session_state.extracted_text = ""
if "embedding_done" not in st.session_state:
    st.session_state.embedding_done = False
if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "summary" not in st.session_state:
    st.session_state.summary = ""
# ---------------------------
# Page Config
# ---------------------------
st.set_page_config(layout="wide", page_title="WebMaster")
st.title("Project WebMaster")
page = st.sidebar.selectbox("Navigation", ["Home", "AI Engine", "Contact"])
if page == "Home":
    st.markdown("""
## Welcome to WebMaster

**WebMaster** is a RAG (Retrieval-Augmented Generation) chatbot application that lets you extract content from any URL, generate detailed summaries, and interact with the content using advanced language models.

With the choice between **Closed Source** (OpenAI) and **Open Source** (Ollama) engines for both summarization and conversation, WebMaster gives you the flexibility to pick the AI stack that best fits your needs.

**Features:**

- **Website Extraction:** Crawl and extract web page content.
- **Summarization:** Generate detailed summaries of the extracted content.
- **Embeddings & Retrieval:** Create embeddings with FAISS for intelligent document retrieval.
- **Chatbot Interface:** Interact with your content via a conversational agent.

Get started by selecting **AI Engine** from the sidebar.
""")
elif page == "AI Engine":
    # ---------------------------
    # URL Input Form
    # ---------------------------
    with st.form("url_form"):
        url_input = st.text_input("Enter a URL to crawl:")
        submit_url = st.form_submit_button("Submit URL")
    if submit_url and url_input:
        st.session_state.url_submitted = True
        # Reset any previous state
        st.session_state.extraction_done = False
        st.session_state.embedding_done = False
        st.session_state.chat_history = []
        st.session_state.summary = ""

    # ---------------------------
    # If a URL has been submitted, divide the layout into three columns
    # ---------------------------
    if st.session_state.url_submitted:
        col1, col2, col3 = st.columns(3)

        # ---------------------------
        # Column 1: Website Extraction & Summarization using crawl4ai
        # ---------------------------
        with col1:
            st.header("1. Website Extraction")
            if not st.session_state.extraction_done:
                with st.spinner("Extracting website..."):
                    # Define the async crawl function (returns markdown output)
                    async def simple_crawl(url):
                        crawler_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
                        async with AsyncWebCrawler() as crawler:
                            result = await crawler.arun(url=url, config=crawler_run_config)
                            return result.markdown

                    # Run the async crawl (nest_asyncio makes this safe inside Streamlit)
                    extracted = asyncio.run(simple_crawl(url_input))
                    st.session_state.extracted_text = extracted
                    st.session_state.extraction_done = True
                st.success("Extraction complete!")

            # Show a preview (first few non-empty lines)
            preview = "\n".join(
                [line for line in st.session_state.extracted_text.splitlines() if line.strip()][:5]
            )
            st.text_area("Extracted Text Preview", preview, height=150)
            # Serve the full extracted text from memory via a download button
            st.download_button(
                label="Download Extracted Text",
                data=st.session_state.extracted_text,
                file_name="extracted_text.txt",
                mime="text/plain",
            )

            st.markdown("---")
            st.subheader("Summarize Web Page")
            if st.button("Summarize Web Page", key="summarize_button"):
                with st.spinner("Summarizing..."):
                    summary_prompt_template = """
You are an AI assistant tasked with summarizing a web page.
Your summary should be detailed and cover all key points mentioned on the page.

Below is the extracted content of the web page:

{content}

Please provide a comprehensive and detailed summary in Markdown format.
"""
                    summary_prompt = PromptTemplate(template=summary_prompt_template, input_variables=["content"])
                    prompt_text = summary_prompt.format(content=st.session_state.extracted_text)
                    summarizer = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.3, max_tokens=1500)
                    summary_response = summarizer.invoke(prompt_text)
                    st.session_state.summary = summary_response.content
                st.success("Summarization complete!")

            if st.session_state.summary:
                st.subheader("Summarized Output")
                st.markdown(st.session_state.summary, unsafe_allow_html=False)

        # ---------------------------
        # Column 2: Creating Embeddings with FAISS
        # ---------------------------
        with col2:
            st.header("2. Create Embeddings")
            if st.session_state.extraction_done and not st.session_state.embedding_done:
                if st.button("Create Embeddings"):
                    with st.spinner("Creating embeddings..."):
                        # Save the extracted text to a markdown file (output.md)
                        with open("output.md", "w", encoding="utf-8") as f:
                            f.write(st.session_state.extracted_text)
                        # Load the markdown file using UnstructuredMarkdownLoader
                        loader = UnstructuredMarkdownLoader("output.md")
                        data = loader.load()
                        # Split the text into overlapping chunks
                        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
                        texts = text_splitter.split_documents(data)
                        # Fail early with a clear message if the OpenAI key is missing
                        if not os.getenv("OPENAI_API_KEY"):
                            st.error("OPENAI_API_KEY is not set; add it to your .env file.")
                            st.stop()
                        # Create embeddings using OpenAIEmbeddings
                        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
                        # Build a FAISS vectorstore from the documents
                        vectorstore = FAISS.from_documents(texts, embeddings)
                        # Persist the vectorstore locally (optional)
                        vectorstore.save_local("faiss_index")
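                        # The persisted index could later be reloaded instead of
                        # re-embedding (a sketch; recent langchain_community versions
                        # require explicit opt-in to pickle deserialization):
                        #   vectorstore = FAISS.load_local(
                        #       "faiss_index", embeddings,
                        #       allow_dangerous_deserialization=True,
                        #   )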
                        st.session_state.vectorstore = vectorstore
                        st.session_state.embedding_done = True
                    st.success("Vector store created!")
            elif st.session_state.embedding_done:
                st.info("Embeddings have been created.")

        # ---------------------------
        # Column 3: Chatbot using streamlit_chat and a Retrieval Chain
        # ---------------------------
        with col3:
            st.header("3. Chat with the Bot")
            if st.session_state.embedding_done:
                # Let the user select the LLM type
                llm_choice = st.radio("Select LLM Type", ("Closed Source", "Open Source"), index=0, key="llm_choice")
                # Set up a retrieval-based QA chain over the vectorstore
                vectorstore = st.session_state.vectorstore
                retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
                prompt_template = """
You are an AI assistant tasked with answering questions based solely
on the provided context. Your goal is to generate a comprehensive answer
to the given question using only the information available in the context.

context: {context}
question: {question}

<response> Your answer in Markdown format. </response>
"""
                prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
                chain_type_kwargs = {"prompt": prompt}
                # Initialize the appropriate LLM based on the selection
                if llm_choice == "Closed Source":
                    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.3, max_tokens=1000)
                else:
                    llm = ChatOllama(model="deepseek-r1:1.5b", base_url="http://localhost:11434", temperature=0.3)
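                    # Assumes an Ollama server is reachable at base_url and the
                    # model has been pulled beforehand, e.g.:
                    #   ollama pull deepseek-r1:1.5b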
                qa = RetrievalQA.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=retriever,
                    return_source_documents=True,
                    chain_type_kwargs=chain_type_kwargs,
                    verbose=True,
                )

                # Chat interface using streamlit_chat
                user_input = st.text_input("Your Message:", key="chat_input")
                if st.button("Send", key="send_button") and user_input:
                    response = qa.invoke({"query": user_input})
                    bot_answer = response["result"]
                    st.session_state.chat_history.append({"user": user_input, "bot": bot_answer})
                    # Save this session's chat history to a file (chat_history.txt)
                    chat_file_content = "\n\n".join(
                        [f"User: {chat['user']}\nBot: {chat['bot']}" for chat in st.session_state.chat_history]
                    )
                    with open("chat_history.txt", "w", encoding="utf-8") as cf:
                        cf.write(chat_file_content)

                # Display the conversation using the streamlit_chat component
                # (unique keys keep Streamlit from colliding on repeated widgets)
                if st.session_state.chat_history:
                    for i, chat in enumerate(st.session_state.chat_history):
                        st_message(chat["user"], is_user=True, key=f"user_{i}")
                        st_message(chat["bot"], is_user=False, key=f"bot_{i}")
            else:
                st.info("Please create embeddings to activate the chat.")
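
# To launch the app (assuming the dependencies above are installed; crawl4ai
# also needs its one-time browser setup, e.g. `crawl4ai-setup`):
#   streamlit run app.py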