diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
index e9b9b5099..e1bc802c9 100644
--- a/DocSum/ui/gradio/docsum_ui_gradio.py
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -6,12 +6,13 @@
 import json
 import logging
 import os
+from urllib.parse import urlparse
 
 import gradio as gr
 import requests
 import uvicorn
 from fastapi import FastAPI
-from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
+from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, UnstructuredURLLoader
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -91,6 +92,69 @@ def read_video_file(self, file):
         base64_str = self.encode_file_to_base64(file)
         return self.generate_summary(base64_str, document_type="video")
 
+    def is_valid_url(self, url):
+        """Check whether ``url`` is an absolute HTTP(S) URL.
+
+        Args:
+            url: The candidate URL string.
+
+        Returns:
+            bool: True if the URL has an http/https scheme and a host name.
+        """
+        if not isinstance(url, str):
+            return False
+        try:
+            result = urlparse(url.strip())
+            # Only web URLs can be fetched by the loader; reject file://, ftp://, etc.
+            return result.scheme in ("http", "https") and bool(result.hostname)
+        except ValueError:
+            return False
+
+    def read_url(self, url):
+        """Read and process the content of a url.
+
+        Args:
+            url: The url to be read as a document.
+
+        Returns:
+            str: The content of the website, or an error message if the url is
+                invalid or its content could not be loaded.
+        """
+        if isinstance(url, str):
+            # Pasted URLs often carry stray whitespace or a trailing newline.
+            url = url.strip()
+        self.page_content = ""
+
+        logger.info(">>> Reading url: %s", url)
+        if not self.is_valid_url(url=url):
+            msg = f"Invalid URL '{url}'. Make sure the link provided is a valid URL."
+            logger.error(msg)
+            return msg
+
+        # `no_proxy` holds host names, not full URLs; register each host only once
+        # so the variable does not grow with every request.
+        host = urlparse(url).hostname
+        no_proxy = [entry for entry in os.environ.get("no_proxy", "").split(",") if entry]
+        if host not in no_proxy:
+            no_proxy.append(host)
+            os.environ["no_proxy"] = ",".join(no_proxy)
+
+        try:
+            pages = UnstructuredURLLoader([url]).load()
+        except Exception as e:
+            msg = f"There was an error trying to read '{url}' --> '{e}'\nTry adding the domain name to your `no_proxy` variable and try again. Example: example.com*"
+            logger.error(msg)
+            return msg
+
+        if not pages:
+            # NOTE(review): with default settings the loader appears to log and skip
+            # URLs it fails to fetch instead of raising -- confirm against its docs.
+            msg = f"No content could be read from '{url}'."
+            logger.error(msg)
+            return msg
+
+        self.page_content = pages[0].page_content
+        return self.page_content
+
     def generate_summary(self, doc_content, document_type="text"):
         """Generate a summary for the given document content.
 
@@ -201,6 +238,25 @@ def render(self):
                     )
             submit_btn.click(fn=self.generate_summary, inputs=[input_text], outputs=[generated_text])
 
+        with gr.Blocks() as url_ui:
+            # URL text UI
+            with gr.Row():
+                with gr.Column():
+                    input_text = gr.TextArea(
+                        label="Please paste a URL for summarization",
+                        placeholder="Paste a URL for the information you need to summarize",
+                    )
+                    submit_btn = gr.Button("Generate Summary")
+                with gr.Column():
+                    generated_text = gr.TextArea(
+                        label="Text Summary", placeholder="Summarized text will be displayed here"
+                    )
+            submit_btn.click(
+                lambda input_text: self.generate_summary(self.read_url(input_text)),
+                inputs=input_text,
+                outputs=generated_text,
+            )
+
         # File Upload UI
         file_ui = self.create_upload_ui(
             label="Please upload a document (.pdf, .doc, .docx)",
@@ -232,6 +288,8 @@ def render(self):
                     audio_ui.render()
                 with gr.TabItem("Upload Video"):
                     video_ui.render()
+                with gr.TabItem("Enter URL"):
+                    url_ui.render()
 
         return self.demo
 
diff --git a/DocSum/ui/gradio/requirements.txt b/DocSum/ui/gradio/requirements.txt
index e11520ca5..9086603d0 100644
--- a/DocSum/ui/gradio/requirements.txt
+++ b/DocSum/ui/gradio/requirements.txt
@@ -6,3 +6,4 @@ numpy==1.26.4
 opencv-python==4.10.0.82
 Pillow==10.3.0
 pypdf
+unstructured