Skip to content

Commit

Permalink
Merge pull request #687 from 0x11c11e/feature/add-semantic-scholar
Browse files Browse the repository at this point in the history
Feature/add semantic scholar
  • Loading branch information
assafelovic authored Jul 28, 2024
2 parents fbff073 + fcd3504 commit 200ac43
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 4 deletions.
35 changes: 32 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,9 +1,38 @@
# .env.example

# OpenAI API key for accessing OpenAI's GPT models
OPENAI_API_KEY=

# API key for accessing Tavily's services
TAVILY_API_KEY=

# API key for accessing LangChain's services
LANGCHAIN_API_KEY=

# Path to the directory where documents are stored
DOC_PATH=./my-docs

# The name of the embedding model to use for Ollama
OLLAMA_EMBEDDING_MODEL=

# The Ollama endpoint to use
OLLAMA_BASE_URL=

# Choose one of the available retrievers by uncommenting the desired retriever:
# RETRIEVER=arxiv
# RETRIEVER=bing
# RETRIEVER=custom
# RETRIEVER=duckduckgo
# RETRIEVER=exa
# RETRIEVER=google
# RETRIEVER=searx
# RETRIEVER=semantic_scholar
# RETRIEVER=serpapi
# RETRIEVER=serper
# RETRIEVER=tavily

# Example setting for retriever
RETRIEVER=tavily

# Maximum number of search results to return per query
MAX_SEARCH_RESULTS_PER_QUERY=5
4 changes: 4 additions & 0 deletions gpt_researcher/master/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ def get_retriever(retriever):
from gpt_researcher.retrievers import ExaSearch

retriever = ExaSearch
case "semantic_scholar":
from gpt_researcher.retrievers import SemanticScholarSearch

retriever = SemanticScholarSearch
case "custom":
from gpt_researcher.retrievers import CustomRetriever

Expand Down
4 changes: 3 additions & 1 deletion gpt_researcher/retrievers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .serpapi.serpapi import SerpApiSearch
from .serper.serper import SerperSearch
from .tavily.tavily_search import TavilySearch
from .semantic_scholar.semantic_scholar import SemanticScholarSearch

__all__ = [
"TavilySearch",
Expand All @@ -17,5 +18,6 @@
"GoogleSearch",
"SearxSearch",
"BingSearch",
"ArxivSearch"
"ArxivSearch",
"SemanticScholarSearch",
]
Empty file.
59 changes: 59 additions & 0 deletions gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import Dict, List

import requests


class SemanticScholarSearch:
    """
    Semantic Scholar API Retriever

    Sends a query to the Semantic Scholar Graph API paper-search endpoint and
    returns only the papers that are flagged open access and carry an
    open-access PDF entry.
    """

    BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
    VALID_SORT_CRITERIA = ["relevance", "citationCount", "publicationDate"]
    # Seconds to wait for the API before giving up; without a timeout a
    # stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, query: str, sort: str = "relevance"):
        """
        Initialize the SemanticScholarSearch class with a query and sort criterion.
        :param query: Search query string
        :param sort: Sort criterion ('relevance', 'citationCount', 'publicationDate'),
            matched case-insensitively
        :raises ValueError: if sort is not one of VALID_SORT_CRITERIA
        """
        self.query = query
        self.sort = self._normalize_sort(sort)

    @classmethod
    def _normalize_sort(cls, sort: str) -> str:
        """
        Return the canonical spelling of a sort criterion.
        :param sort: Sort criterion in any letter case
        :return: The matching entry of VALID_SORT_CRITERIA
        :raises ValueError: if no entry matches
        """
        wanted = str(sort).lower()
        for criterion in cls.VALID_SORT_CRITERIA:
            if criterion.lower() == wanted:
                # Keep the camelCase spelling from VALID_SORT_CRITERIA rather
                # than the lowercased input.
                return criterion
        # An explicit raise (not assert) so validation survives `python -O`.
        raise ValueError(
            f"Invalid sort criterion: {sort!r}. "
            f"Expected one of {cls.VALID_SORT_CRITERIA}"
        )

    def search(self, max_results: int = 20) -> List[Dict[str, str]]:
        """
        Perform the search on Semantic Scholar and return results.
        :param max_results: Maximum number of results to retrieve
        :return: List of dictionaries containing title, href, and body of each
            open-access paper; an empty list if the request or decoding fails
        """
        params = {
            "query": self.query,
            "limit": max_results,
            "fields": "title,abstract,url,venue,year,authors,isOpenAccess,openAccessPdf",
            "sort": self.sort,
        }

        try:
            response = requests.get(
                self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            # Decoding stays inside the try: a non-JSON body raises ValueError
            # (or a RequestException subclass, depending on requests version).
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"An error occurred while accessing Semantic Scholar API: {e}")
            return []

        # `or []` also covers a "data" key that is present but null.
        papers = (payload.get("data") if isinstance(payload, dict) else None) or []
        search_result = []

        for paper in papers:
            pdf_info = paper.get("openAccessPdf")
            if not (paper.get("isOpenAccess") and pdf_info):
                continue
            # dict.get(key, default) ignores the default when the key exists
            # with a null value, so fall back with `or` instead.
            search_result.append(
                {
                    "title": paper.get("title") or "No Title",
                    "href": pdf_info.get("url") or "No URL",
                    "body": paper.get("abstract") or "Abstract not available",
                }
            )

        return search_result

0 comments on commit 200ac43

Please sign in to comment.