crawler_valor #85
base: master
Changes from 2 commits
@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
from goose import Goose
import pymongo
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import datetime
import zlib
import cPickle as CP
import cld
import sys
from requests.exceptions import ConnectionError, MissingSchema, Timeout
import bson
import settings
import logging_mc

logger = logging_mc.get_logger( 'valor' )


client = pymongo.MongoClient(settings.MONGOHOST, 27017)
MCDB = client.MCDB
ARTICLES = MCDB.articles # Article Collection
ARTICLES.ensure_index("source")

def find_articles():
    """
    Get the urls of the latest news
    :return: latest news urls for all categories
    :rtype: set()
    """
    urls = ['http://www.valor.com.br/ultimas-noticias/brasil',
            'http://www.valor.com.br/ultimas-noticias/politica',
            'http://www.valor.com.br/ultimas-noticias/financas',
            'http://www.valor.com.br/ultimas-noticias/empresas',
            'http://www.valor.com.br/ultimas-noticias/agro',
            'http://www.valor.com.br/ultimas-noticias/internacional',
            'http://www.valor.com.br/ultimas-noticias/opiniao',
            'http://www.valor.com.br/ultimas-noticias/legislacao',
            'http://www.valor.com.br/ultimas-noticias/carreira',
            'http://www.valor.com.br/ultimas-noticias/cultura']
    news_urls = list()
    for INDEX_URL in urls:
        index = requests.get(INDEX_URL).content
        soup = BeautifulSoup(index, "lxml")
        news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2')
        news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup( art.encode('utf8') , "lxml" ).find('a').attrs['href'] for art in news_index]
It might be a good idea to break this line to make it more readable. PEP8 suggests 72 with a maximum of 79. I like to follow that whenever possible.
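For illustration, one way to split that comprehension within the suggested limits could be the following sketch (behaviour unchanged; the intermediate name hrefs is only illustrative):

        hrefs = [
            BeautifulSoup(art.encode('utf8'), "lxml").find('a').attrs['href']
            for art in news_index
        ]
        news_urls += ['http://www.valor.com.br' + href for href in hrefs]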
    return set(news_urls )

def get_published_time(soup):
    """
    Get the news' published datetime
    :param soup: object with news html page
    :type soup: BeautifulSoup object
    :return: news published datetime
    :rtype: datetime.datetime
    """
    try:
        time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text
    except IndexError:
        logger.error('wrong time tag')
        return None
    if time_tag is None:
        return None
    else:
        try:
            published_time = datetime.datetime.strptime( time_tag.encode('utf8') , '%d/%m/%Y às %Hh%M')
        except ValueError:
            logger.error('wrong date extraction')
            return None
    return published_time

def extract_title(article):
    """
    Extract the news title.
    """

    try:
        title = article.title
    except Exception as ex:
        template = "An exception of type {0} occurred during extraction of news title. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        logger.exception(message)
        return None
    return title

def extract_content(article):
    """
    Extract relevant information about the news page
    """

    try:
        body_content = article.cleaned_text
    except Exception as ex:
        template = "An exception of type {0} occurred during extraction of news content. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        logger.exception(message)
        return None
    return body_content

def detect_language(text):
    """
    Detect the language of text using chromium_compact_language_detector
    :param text: text to be analyzed
    :return: dict with the detected language name and code, e.g. Portuguese and "pt"
    """
    name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8'))
    return {"name": name, "code": code}

def compress_content(html):
    """
    Compresses and encodes html content so that it can be BSON encoded and stored in mongodb
    :param html: original html document
    :return: compressed document wrapped in bson.Binary
    """
    pickled = CP.dumps(html, CP.HIGHEST_PROTOCOL)
    squished = zlib.compress(pickled)
    encoded = bson.Binary(squished) # b64.urlsafe_b64encode(squished)
    return encoded

def decompress_content(compressed_html):
    """
    Decompress data compressed by `compress_content`
    :param compressed_html: compressed html document
    :return: original html
    """
    # unencoded = b64.urlsafe_b64decode(str(compressed_html))
    decompressed = zlib.decompress(compressed_html)
    orig_html = CP.loads(decompressed)
    return orig_html

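As a quick sanity check of these two helpers (an illustrative snippet, not part of the diff), a roundtrip should give back the original document:

html = u'<html><body><p>exemplo</p></body></html>'
assert decompress_content(compress_content(html)) == html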
def download_article(url):
    """
    Download the html content of a news page
    :param url: news page's url
    :type url: string
    :return: article data extracted from the news page
    :rtype: dict
    """
    article = {
        'link': url,
        'source': 'crawler_Valor',
    }
    logger.info("Downloading article: %s", url)
    try:
        response = requests.get(url, timeout=30)
    except ConnectionError:
        logger.error("Failed to fetch %s", url)
        return
    except Timeout:
        logger.error("Timed out while fetching %s", url)
I'd use new style string formatting here, just to keep it consistent and make the move to python3 easier later on.
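For example, the suggestion might look like this sketch (same message, new-style formatting):

        logger.error("Timed out while fetching {0}".format(url))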
        return

    encoding = response.encoding if response.encoding is not None else 'utf8'
    dec_content = response.content.decode(encoding)
I think that if you use response.text here, requests will decode the content for you using the detected encoding.
    soup = BeautifulSoup(dec_content, "lxml")
    extractor = Goose({'use_meta_language': False, 'target_language':'pt'})
    news = extractor.extract(url=url)

    article['link_content'] = compress_content(dec_content)
    article['compressed'] = True
    article['language'] = detect_language(dec_content)
    article['title'] = extract_title(news)
    article['published'] = get_published_time(soup)
    article['main_text'] = extract_content(news)

    return article

if __name__ =='__main__':
    for url in find_articles():
        print url
I think it's a better idea to use the logger here instead of print.
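A sketch of the suggestion, reusing the module-level logger (the message wording is only illustrative):

        logger.info("Processing article: %s", url)

The same would apply to the 'download done', 'salved' and 'it already exists' prints below.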
        exists = list(ARTICLES.find({"link": url}))
        if not exists:
            article = download_article(url)
            print 'download done'
            ARTICLES.insert(article, w=1)
            print 'salved'
Small typo here: it should say 'saved'.
        else:
            print 'it already exists'
I'd also remove these trailing empty lines.
@@ -0,0 +1,32 @@
import logging
from logging.handlers import RotatingFileHandler


def get_logger( source ):
    """
    Responsible for saving logs of operations
    :return: logger configured based on source
    :rtype: logging.Logger
    """


    logger = logging.getLogger(source)
    logger.setLevel(logging.DEBUG)

    # create stream handler and set level to debug
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    file_handler = RotatingFileHandler( '/tmp/mediacloud_{0}.log'.format(source), maxBytes=5e6, backupCount=3)

    # create formatter
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # add formatter to stream_handler
    stream_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    # add stream_handler to logger
    logger.addHandler(stream_handler) # uncomment for console output of messages
    logger.addHandler(file_handler)

    return logger
You don't need to import re nor pandas. It's a good idea to remove these imports.