Merge pull request #687 from 0x11c11e/feature/add-semantic-scholar

Feature/add semantic scholar
assafelovic · Jul 28, 2024 · 200ac43 · 200ac43
2 parents fbff073 + fcd3504
commit 200ac43
Show file tree

Hide file tree

Showing 5 changed files with 98 additions and 4 deletions.
diff --git a/.env.example b/.env.example
@@ -1,9 +1,38 @@
+# .env.example
+
+# OpenAI API key for accessing OpenAI's GPT models
 OPENAI_API_KEY=
+
+# API key for accessing Tavily's services
 TAVILY_API_KEY=
+
+# API key for accessing LangChain's services
 LANGCHAIN_API_KEY=
+
+# Path to the directory where documents are stored
 DOC_PATH=./my-docs
 
-# the name of the embedding model to use for Ollama
+# The name of the embedding model to use for Ollama
 OLLAMA_EMBEDDING_MODEL=
-# the Ollama endpoint to use
-OLLAMA_BASE_URL=
+
+# The Ollama endpoint to use
+OLLAMA_BASE_URL=
+
+# Available retrievers. RETRIEVER must be set to exactly one of these values;
+# if you uncomment a line here, remove any other RETRIEVER= line in this file:
+# RETRIEVER=arxiv
+# RETRIEVER=bing
+# RETRIEVER=custom
+# RETRIEVER=duckduckgo
+# RETRIEVER=exa
+# RETRIEVER=google
+# RETRIEVER=searx
+# RETRIEVER=semantic_scholar
+# RETRIEVER=serpapi
+# RETRIEVER=serper
+# RETRIEVER=tavily
+
+# Active retriever setting (change the value to one of the options listed above)
+RETRIEVER=tavily
+
+# Maximum number of search results to return per query
+MAX_SEARCH_RESULTS_PER_QUERY=5
diff --git a/gpt_researcher/master/actions.py b/gpt_researcher/master/actions.py
@@ -58,6 +58,10 @@ def get_retriever(retriever):
             from gpt_researcher.retrievers import ExaSearch
 
             retriever = ExaSearch
+        case "semantic_scholar":
+            from gpt_researcher.retrievers import SemanticScholarSearch
+
+            retriever = SemanticScholarSearch
         case "custom":
             from gpt_researcher.retrievers import CustomRetriever
 

diff --git a/gpt_researcher/retrievers/__init__.py b/gpt_researcher/retrievers/__init__.py
@@ -7,6 +7,7 @@
 from .serpapi.serpapi import SerpApiSearch
 from .serper.serper import SerperSearch
 from .tavily.tavily_search import TavilySearch
+from .semantic_scholar.semantic_scholar import SemanticScholarSearch
 
 __all__ = [
     "TavilySearch",
@@ -17,5 +18,6 @@
     "GoogleSearch",
     "SearxSearch",
     "BingSearch",
-    "ArxivSearch"
+    "ArxivSearch",
+    "SemanticScholarSearch",
 ]
diff --git a/gpt_researcher/retrievers/semantic_scholar/__init__.py b/gpt_researcher/retrievers/semantic_scholar/__init__.py
diff --git a/gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py b/gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py
@@ -0,0 +1,59 @@
+from typing import Dict, List
+
+import requests
+
+
+class SemanticScholarSearch:
+    """
+    Semantic Scholar API Retriever
+
+    Sends a query to the Semantic Scholar paper-search endpoint (BASE_URL) and
+    returns only those papers that are flagged open access and carry an
+    open-access PDF entry, formatted as title/href/body dictionaries.
+    """
+
+    BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
+    VALID_SORT_CRITERIA = ["relevance", "citationCount", "publicationDate"]
+    # Seconds to wait for the API. Without a timeout, requests waits
+    # indefinitely on a stalled connection and the caller never returns.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, query: str, sort: str = "relevance"):
+        """
+        Initialize the SemanticScholarSearch class with a query and sort criterion.
+
+        :param query: Search query string
+        :param sort: Sort criterion ('relevance', 'citationCount', 'publicationDate');
+            matched case-insensitively and stored in its canonical spelling
+        :raises ValueError: If sort is not one of VALID_SORT_CRITERIA
+        """
+        self.query = query
+
+        # Validate with an explicit exception rather than `assert`, which is
+        # removed when Python runs with -O.  The lookup is case-insensitive, but
+        # the stored value keeps the canonical camelCase spelling: lowercasing
+        # after validation would turn "citationCount" into "citationcount",
+        # which is no longer one of the validated names.
+        canonical_names = {name.lower(): name for name in self.VALID_SORT_CRITERIA}
+        canonical_sort = canonical_names.get(str(sort).lower())
+        if canonical_sort is None:
+            raise ValueError(
+                f"Invalid sort criterion: {sort!r}. "
+                f"Expected one of {self.VALID_SORT_CRITERIA}"
+            )
+        self.sort = canonical_sort
+
+    def search(self, max_results: int = 20) -> List[Dict[str, str]]:
+        """
+        Perform the search on Semantic Scholar and return results.
+
+        Network errors, HTTP error statuses and undecodable response bodies are
+        reported with print() and produce an empty list instead of raising.
+
+        :param max_results: Maximum number of results to retrieve
+        :return: List of dictionaries containing title, href, and body of each paper
+        """
+        params = {
+            "query": self.query,
+            "limit": max_results,
+            "fields": "title,abstract,url,venue,year,authors,isOpenAccess,openAccessPdf",
+            "sort": self.sort,
+        }
+
+        try:
+            response = requests.get(
+                self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
+            )
+            response.raise_for_status()
+            # Decode inside the try block: a non-JSON body raises ValueError
+            # (or a RequestException subclass, depending on the requests
+            # version) and should be handled like any other failed lookup.
+            payload = response.json()
+        except (requests.RequestException, ValueError) as e:
+            print(f"An error occurred while accessing Semantic Scholar API: {e}")
+            return []
+
+        # Tolerate a missing or null "data" member, or a non-object payload.
+        results = (payload.get("data") if isinstance(payload, dict) else None) or []
+        search_result = []
+
+        for result in results:
+            pdf_info = result.get("openAccessPdf")
+            if not (result.get("isOpenAccess") and pdf_info):
+                continue
+
+            # `or` instead of dict.get's default: the default is only used when
+            # the key is absent, so a key present with a null/empty value would
+            # otherwise leak None into the declared Dict[str, str] result.
+            search_result.append(
+                {
+                    "title": result.get("title") or "No Title",
+                    "href": pdf_info.get("url") or "No URL",
+                    "body": result.get("abstract") or "Abstract not available",
+                }
+            )
+
+        return search_result