Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding URL summary option to DocSum Gradio-UI #1248

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
61 changes: 60 additions & 1 deletion DocSum/ui/gradio/docsum_ui_gradio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import json
import logging
import os
from urllib.parse import urlparse

import gradio as gr
import requests
import uvicorn
from fastapi import FastAPI
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, UnstructuredURLLoader

# Configure logging
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -91,6 +92,42 @@ def read_video_file(self, file):
base64_str = self.encode_file_to_base64(file)
return self.generate_summary(base64_str, document_type="video")

def is_valid_url(self, url):
    """Return whether ``url`` is a well-formed absolute HTTP(S) URL.

    Args:
        url: The candidate URL. Surrounding whitespace is ignored.

    Returns:
        bool: True when ``url`` is a string whose scheme is ``http`` or
        ``https`` and whose network location is non-empty; False otherwise
        (including for non-string input).
    """
    # Anything that is not text can never be a URL; reject it up front rather
    # than letting urlparse fail with an unrelated exception type.
    if not isinstance(url, str):
        return False
    try:
        # Pasted text commonly carries stray spaces/newlines that are not part
        # of the URL itself.
        result = urlparse(url.strip())
    except ValueError:
        # urlparse raises ValueError for malformed network locations, e.g. an
        # unbalanced "[" in an IPv6 literal.
        return False
    # Require a web scheme in addition to a host: a bare scheme+netloc check
    # would also accept "ftp://...", "file://host/..." and similar.
    return result.scheme.lower() in ("http", "https") and bool(result.netloc)

def read_url(self, url):
    """Read and process the content of a url.

    Args:
        url: The url to be read as a document.

    Returns:
        str: The text content of the page, or an error message if the url is
        invalid, could not be loaded, or yielded no content.
    """
    self.page_content = ""

    # Pasted text commonly carries stray spaces/newlines; drop them so the
    # same value is validated, logged and fetched.
    if isinstance(url, str):
        url = url.strip()

    logger.info(">>> Reading url: %s", url)

    if not self.is_valid_url(url=url):
        msg = f"Invalid URL '{url}'. Make sure the link provided is a valid URL."
        logger.error(msg)
        return msg

    # `no_proxy` entries are host names, not full URLs, so register only the
    # host part. Skip hosts that are already listed so the variable does not
    # grow on every request.
    host = urlparse(url).hostname
    no_proxy_hosts = [
        entry.strip()
        for entry in os.environ.get("no_proxy", "").split(",")
        if entry.strip()
    ]
    if host and host not in no_proxy_hosts:
        no_proxy_hosts.append(host)
        os.environ["no_proxy"] = ",".join(no_proxy_hosts)

    try:
        documents = UnstructuredURLLoader([url]).load()
    except Exception as e:
        # UI boundary: any loader failure is reported back to the caller as
        # text instead of propagating into the Gradio callback.
        msg = f"There was an error trying to read '{url}' --> '{e}'\nTry adding the domain name to your `no_proxy` variable and try again. Example: example.com*"
        logger.error(msg)
        return msg

    # The loader can come back with no documents (for example when it is
    # set to continue past fetch failures); report that explicitly rather
    # than failing on an index lookup.
    if not documents:
        msg = f"No content could be extracted from '{url}'."
        logger.error(msg)
        return msg

    # A single URL was requested, so only the first document is relevant.
    self.page_content = documents[0].page_content
    return self.page_content

def generate_summary(self, doc_content, document_type="text"):
"""Generate a summary for the given document content.

Expand Down Expand Up @@ -152,6 +189,7 @@ def generate_summary(self, doc_content, document_type="text"):

except requests.exceptions.RequestException as e:
logger.error("Request exception: %s", e)
print("")
return str(e)

return str(response.status_code)
Expand Down Expand Up @@ -201,6 +239,25 @@ def render(self):
)
submit_btn.click(fn=self.generate_summary, inputs=[input_text], outputs=[generated_text])

with gr.Blocks() as url_ui:
# URL text UI
with gr.Row():
with gr.Column():
input_text = gr.TextArea(
label="Please paste a URL for summarization",
placeholder="Paste a URL for the information you need to summarize",
)
submit_btn = gr.Button("Generate Summary")
with gr.Column():
generated_text = gr.TextArea(
label="Text Summary", placeholder="Summarized text will be displayed here"
)
submit_btn.click(
lambda input_text: self.generate_summary(self.read_url(input_text)),
inputs=input_text,
outputs=generated_text,
)

# File Upload UI
file_ui = self.create_upload_ui(
label="Please upload a document (.pdf, .doc, .docx)",
Expand Down Expand Up @@ -232,6 +289,8 @@ def render(self):
audio_ui.render()
with gr.TabItem("Upload Video"):
video_ui.render()
with gr.TabItem("Enter URL"):
url_ui.render()

return self.demo

Expand Down
1 change: 1 addition & 0 deletions DocSum/ui/gradio/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ numpy==1.26.4
opencv-python==4.10.0.82
Pillow==10.3.0
pypdf
unstructured
Loading