From b2704019daf7102252e24099dce08c2e1a250dcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?=
Date: Sat, 25 Jul 2015 01:05:43 -0300
Subject: [PATCH 1/2] crawler for valor

---
 capture/crawler_valor.py | 222 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 capture/crawler_valor.py

diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py
new file mode 100644
index 0000000..16cd714
--- /dev/null
+++ b/capture/crawler_valor.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+from logging.handlers import RotatingFileHandler
+from goose import Goose
+import pymongo
+from bs4 import BeautifulSoup
+import requests
+import re
+import pandas as pd
+import logging
+import datetime
+import zlib
+import cPickle as CP
+import cld
+import sys
+from requests.exceptions import ConnectionError, MissingSchema, Timeout
+from bson.errors import InvalidDocument
+from pymongo.errors import DuplicateKeyError
+import bson
+from dateutil.parser import parse
+import settings
+
+
+###########################
+# Setting up Logging
+###########################
+logger = logging.getLogger("Valor")
+logger.setLevel(logging.DEBUG)
+# create console handler and set level to debug
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+fh = RotatingFileHandler('/tmp/mediacloud_Valor.log', maxBytes=5e6, backupCount=3)
+# create formatter
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# add formatter to both handlers
+ch.setFormatter(formatter)
+fh.setFormatter(formatter)
+logger.addHandler(fh)
+#logger.addHandler(ch)  # uncomment for console output of messages
+
+client = pymongo.MongoClient(settings.MONGOHOST, 27017)
+MCDB = client.MCDB
+ARTICLES = MCDB.articles  # Article Collection
+ARTICLES.ensure_index("source")
+
+def find_articles():
+    """
+    Get the URLs of the latest news
+    :return: the latest news URLs from all categories
+    :rtype: set
+    """
+    urls = ['http://www.valor.com.br/ultimas-noticias/brasil',
+            'http://www.valor.com.br/ultimas-noticias/politica',
+            'http://www.valor.com.br/ultimas-noticias/financas',
+            'http://www.valor.com.br/ultimas-noticias/empresas',
+            'http://www.valor.com.br/ultimas-noticias/agro',
+            'http://www.valor.com.br/ultimas-noticias/internacional',
+            'http://www.valor.com.br/ultimas-noticias/opiniao',
+            'http://www.valor.com.br/ultimas-noticias/legislacao',
+            'http://www.valor.com.br/ultimas-noticias/carreira',
+            'http://www.valor.com.br/ultimas-noticias/cultura']
+    news_urls = list()
+    for INDEX_URL in urls:
+        index = requests.get(INDEX_URL).content
+        soup = BeautifulSoup(index, "lxml")
+        news_index = soup.find(id="block-valor_capa_automatica-central_automatico").find_all('h2')
+        news_urls = news_urls + ['http://www.valor.com.br' + BeautifulSoup(art.encode('utf8'), "lxml").find('a').attrs['href'] for art in news_index]
+    return set(news_urls)
+
+def get_published_time(soup):
+    """
+    Get the news' published datetime
+    :param soup: object with news html page
+    :type soup: BeautifulSoup object
+    :return: news published datetime, or None if it cannot be extracted
+    :rtype: datetime.datetime
+    """
+    try:
+        time_tag = soup.find(id="content-area").find_all('span', class_='date submitted')[0].text
+    except IndexError:
+        logger.error('wrong time tag')
+        return None
+    if time_tag is None:
+        return None
+    else:
+        try:
+            published_time = datetime.datetime.strptime(time_tag.encode('utf8'), '%d/%m/%Y às %Hh%M')
+        except ValueError:
+            logger.error('wrong date extraction')
+            return None
+    return published_time
+
+def extract_title(article):
+    """
+    Extract the news title.
+    """
+
+    try:
+        title = article.title
+    except Exception as ex:
+        template = "An exception of type {0} occurred during extraction of news title. Arguments:\n{1!r}"
+        message = template.format(type(ex).__name__, ex.args)
+        logger.exception(message)
+        return None
+    return title
+
+def extract_content(article):
+    """
+    Extract the cleaned main text of the news page
+    """
+
+    try:
+        body_content = article.cleaned_text
+    except Exception as ex:
+        template = "An exception of type {0} occurred during extraction of news content. Arguments:\n{1!r}"
+        message = template.format(type(ex).__name__, ex.args)
+        logger.exception(message)
+        return None
+    return body_content
+
+def detect_language(text):
+    """
+    Detect the language of text using chromium_compact_language_detector
+    :param text: text to be analyzed
+    :return: dict with the detected language, e.g. {"name": "PORTUGUESE", "code": "pt"}
+    """
+    name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8'))
+    return {"name": name, "code": code}
+
+def compress_content(html):
+    """
+    Compress and encode html content so that it can be BSON encoded and stored in MongoDB
+    :param html: original html document
+    :return: compressed document wrapped in bson.Binary
+    """
+    pickled = CP.dumps(html, CP.HIGHEST_PROTOCOL)
+    squished = zlib.compress(pickled)
+    encoded = bson.Binary(squished)  # b64.urlsafe_b64encode(squished)
+    return encoded
+
+def decompress_content(compressed_html):
+    """
+    Decompress data compressed by `compress_content`
+    :param compressed_html: compressed html document
+    :return: original html
+    """
+    # unencoded = b64.urlsafe_b64decode(str(compressed_html))
+    decompressed = zlib.decompress(compressed_html)
+    orig_html = CP.loads(decompressed)
+    return orig_html
+
+
+def download_article(url):
+    """
+    Download the html content of a news page
+    :param url: news page's url
+    :type url: string
+    :return: article dict with the compressed page and extracted metadata
+    :rtype: dict
+    """
+    article = {
+        'link': url,
+        'source': 'crawler_Valor',
+    }
+    logger.info("Downloading article: %s", url)
+    try:
+        response = requests.get(url, timeout=30)
+    except ConnectionError:
+        logger.error("Failed to fetch %s", url)
+        return
+    except Timeout:
+        logger.error("Timed out while fetching %s", url)
+        return
+
+    encoding = response.encoding if response.encoding is not None else 'utf8'
+    dec_content = response.content.decode(encoding)
+    soup = BeautifulSoup(dec_content, "lxml")
+    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
+    news = extractor.extract(url=url)
+
+    article['link_content'] = compress_content(dec_content)
+    article['compressed'] = True
+    article['language'] = detect_language(dec_content)
+    article['title'] = extract_title(news)
+    article['published'] = get_published_time(soup)
+    article['main_text'] = extract_content(news)
+
+    return article
+
+if __name__ == '__main__':
+    for url in find_articles():
+        print url
+        exists = list(ARTICLES.find({"link": url}))
+        if not exists:
+            article = download_article(url)
+            if article is not None:
+                ARTICLES.insert(article, w=1)
+                print 'saved'
+        else:
+            print 'it already exists'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

From e1b8a518ee8ab1f3ce5735759babf1e94e2d684b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Venicius=20Gon=C3=A7alves?=
Date: Sat, 25 Jul 2015 12:44:19 -0300
Subject: [PATCH 2/2] I've created logging_mc to configure the logger. This
 way, we don't need to repeat this code in every crawler.
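
With this helper, a crawler only needs two lines to get a logger that writes
to its own rotating file under /tmp, as the crawler_valor.py change below
shows. A rough sketch of how any future crawler would reuse it (the 'estadao'
source name here is only illustrative):

    import logging_mc

    logger = logging_mc.get_logger('estadao')  # logs to /tmp/mediacloud_estadao.log
    logger.info("starting crawl")

The file name pattern, DEBUG level and message format all come from
logging_mc.get_logger, so every crawler logs the same way.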
---
 capture/crawler_valor.py | 23 ++---------------------
 capture/logging_mc.py    | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 21 deletions(-)
 create mode 100644 capture/logging_mc.py

diff --git a/capture/crawler_valor.py b/capture/crawler_valor.py
index 16cd714..b28805a 100644
--- a/capture/crawler_valor.py
+++ b/capture/crawler_valor.py
@@ -1,41 +1,22 @@
 # -*- coding: utf-8 -*-
-from logging.handlers import RotatingFileHandler
 from goose import Goose
 import pymongo
 from bs4 import BeautifulSoup
 import requests
 import re
 import pandas as pd
-import logging
 import datetime
 import zlib
 import cPickle as CP
 import cld
 import sys
 from requests.exceptions import ConnectionError, MissingSchema, Timeout
-from bson.errors import InvalidDocument
-from pymongo.errors import DuplicateKeyError
 import bson
-from dateutil.parser import parse
 import settings
+import logging_mc

+logger = logging_mc.get_logger('valor')

-###########################
-# Setting up Logging
-###########################
-logger = logging.getLogger("Valor")
-logger.setLevel(logging.DEBUG)
-# create console handler and set level to debug
-ch = logging.StreamHandler()
-ch.setLevel(logging.DEBUG)
-fh = RotatingFileHandler('/tmp/mediacloud_Valor.log', maxBytes=5e6, backupCount=3)
-# create formatter
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-# add formatter to both handlers
-ch.setFormatter(formatter)
-fh.setFormatter(formatter)
-logger.addHandler(fh)
-#logger.addHandler(ch)  # uncomment for console output of messages

 client = pymongo.MongoClient(settings.MONGOHOST, 27017)
 MCDB = client.MCDB
diff --git a/capture/logging_mc.py b/capture/logging_mc.py
new file mode 100644
index 0000000..24db0a5
--- /dev/null
+++ b/capture/logging_mc.py
@@ -0,0 +1,32 @@
+import logging
+from logging.handlers import RotatingFileHandler
+
+def get_logger(source):
+    """
+    Configure a logger that saves the operation logs for the given source
+    :return: logger configured based on source
+    :rtype: logging.Logger
+
+    """
+
+
+    logger = logging.getLogger(source)
+    logger.setLevel(logging.DEBUG)
+
+    # create stream handler and set level to debug
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.DEBUG)
+    file_handler = RotatingFileHandler('/tmp/mediacloud_{0}.log'.format(source), maxBytes=5e6, backupCount=3)
+
+    # create formatter
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # add formatter to both handlers
+    stream_handler.setFormatter(formatter)
+    file_handler.setFormatter(formatter)
+
+    # add both handlers to logger
+    logger.addHandler(stream_handler)  # console output of messages
+    logger.addHandler(file_handler)
+
+    return logger