Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

crawler_valor #85

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 203 additions & 0 deletions capture/crawler_valor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
from goose import Goose
import pymongo
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to import either re or pandas. It's a good idea to remove these imports.

import datetime
import zlib
import cPickle as CP
import cld
import sys
from requests.exceptions import ConnectionError, MissingSchema, Timeout
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sys and MissingSchema are also unnecessary imports.

import bson
import settings
import logging_mc

# Module-level logger obtained from the project's logging_mc helper under the name 'valor'.
logger = logging_mc.get_logger( 'valor' )


# MongoDB client for settings.MONGOHOST on port 27017; created at import time.
client = pymongo.MongoClient(settings.MONGOHOST, 27017)
MCDB = client.MCDB
ARTICLES = MCDB.articles # Article Collection
# Request an index on the "source" field of the articles collection.
ARTICLES.ensure_index("source")

def find_articles():
    """
    Get the urls of the latest news.

    Every category index page is downloaded and the links found inside the
    <h2> headings of its central block are collected. A category whose page
    cannot be fetched, or that lacks the expected block, is logged and
    skipped so that one bad page does not abort the whole crawl.

    :return: latest news' urls of all categories
    :rtype: set()
    """
    base_url = 'http://www.valor.com.br'
    index_urls = (
        'http://www.valor.com.br/ultimas-noticias/brasil',
        'http://www.valor.com.br/ultimas-noticias/politica',
        'http://www.valor.com.br/ultimas-noticias/financas',
        'http://www.valor.com.br/ultimas-noticias/empresas',
        'http://www.valor.com.br/ultimas-noticias/agro',
        'http://www.valor.com.br/ultimas-noticias/internacional',
        'http://www.valor.com.br/ultimas-noticias/opiniao',
        'http://www.valor.com.br/ultimas-noticias/legislacao',
        'http://www.valor.com.br/ultimas-noticias/carreira',
        'http://www.valor.com.br/ultimas-noticias/cultura',
    )
    news_urls = set()
    for index_url in index_urls:
        try:
            # Without a timeout a stalled server would hang the crawler.
            index = requests.get(index_url, timeout=30).content
        except (ConnectionError, Timeout):
            logger.error("Failed to fetch index page %s", index_url)
            continue

        soup = BeautifulSoup(index, "lxml")
        central_block = soup.find(
            id="block-valor_capa_automatica-central_automatico")
        if central_block is None:
            logger.error("News block not found in %s", index_url)
            continue

        for heading in central_block.find_all('h2'):
            # The heading is already a parsed tag; no need to re-parse it.
            link = heading.find('a')
            if link is None or not link.get('href'):
                continue
            news_urls.add(base_url + link['href'])
    return news_urls

def get_published_time(soup):
    """
    Get the news' published datetime.

    :param soup: object with news html page
    :type soup: BeautifulSoup object
    :return: news published datetime, or None when the date cannot be
        located or parsed (the failure is logged)
    :rtype: datetime.datetime or None
    """
    content_area = soup.find(id="content-area")
    if content_area is None:
        # Page without the expected layout: there is nowhere to look.
        logger.error('wrong time tag')
        return None

    date_spans = content_area.find_all('span', class_='date submitted')
    if not date_spans:
        logger.error('wrong time tag')
        return None

    # Whitespace around the text would make strptime reject the value.
    time_text = date_spans[0].text.strip()
    if not isinstance(time_text, str):
        # Python 2: the format below is a utf-8 byte string, so the unicode
        # text has to be encoded before it is matched against it.
        time_text = time_text.encode('utf8')

    try:
        return datetime.datetime.strptime(time_text, '%d/%m/%Y às %Hh%M')
    except ValueError:
        logger.error('wrong date extraction')
        return None

def extract_title(article):
    """
    Return the title of an extracted article.

    :param article: object whose ``title`` attribute is read
    :return: the title, or None when reading it raises (the error is logged)
    """
    try:
        return article.title
    except Exception as error:
        logger.exception(
            "An exception of type {0} occured during extraction of news title. Arguments:\n{1!r}".format(
                type(error).__name__, error.args))
    return None

def extract_content(article):
    """
    Return the cleaned body text of an extracted article.

    :param article: object whose ``cleaned_text`` attribute is read
    :return: the text, or None when reading it raises (the error is logged)
    """
    try:
        return article.cleaned_text
    except Exception as error:
        logger.exception(
            "An exception of type {0} occured during extraction of news content. Arguments:\n{1!r}".format(
                type(error).__name__, error.args))
    return None

def detect_language(text):
    """
    Detect the language of a text with cld (chromium compact language detector).

    :param text: text to be analyzed; it is encoded to utf-8 before detection
    :return: dict of the form {"name": <language name>, "code": <language code>}
    """
    utf8_text = text.encode('utf8')
    # cld.detect yields five values; only the first two are used here.
    lang_name, lang_code, _reliable, _bytes_found, _details = cld.detect(utf8_text)
    return dict(name=lang_name, code=lang_code)

def compress_content(html):
    """
    Pickle and zlib-compress an html document so that it can be BSON encoded
    and stored in mongodb.

    :param html: original html document
    :return: bson.Binary wrapping the compressed pickle of the document
    """
    return bson.Binary(zlib.compress(CP.dumps(html, CP.HIGHEST_PROTOCOL)))

def decompress_content(compressed_html):
    """
    Restore a document produced by `compress_content`.

    :param compressed_html: zlib-compressed pickle of the html document
    :return: original html
    """
    # NOTE: unpickling can execute code embedded in the payload, so this must
    # only be given data that this crawler stored itself.
    return CP.loads(zlib.decompress(compressed_html))


def download_article(url):
    """
    Download a news page and build the article document to be stored.

    :param url: news page's url
    :type url: string
    :return: article document (link, source, compressed html, language,
        title, published datetime and main text), or None when the page
        could not be fetched
    :rtype: dict or None
    """
    logger.info("Downloading article: %s", url)
    try:
        response = requests.get(url, timeout=30)
    except ConnectionError:
        logger.error("Failed to fetch %s", url)
        return None
    except Timeout:
        logger.error("Timed out while fetching %s", url)
        return None

    if not response.ok:
        # Do not store 4xx/5xx error pages as if they were articles.
        logger.error("Failed to fetch %s (HTTP %s)", url, response.status_code)
        return None

    encoding = response.encoding or 'utf8'
    try:
        # 'replace' keeps a page with a few bad bytes instead of raising
        # UnicodeDecodeError and losing the whole article.
        dec_content = response.content.decode(encoding, 'replace')
    except LookupError:
        # The server announced a charset name Python does not know.
        dec_content = response.content.decode('utf8', 'replace')

    soup = BeautifulSoup(dec_content, "lxml")
    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    # NOTE(review): Goose is given the url, not the html fetched above, so it
    # presumably downloads the page a second time -- confirm and consider
    # passing the already downloaded html instead.
    news = extractor.extract(url=url)

    return {
        'link': url,
        'source': 'crawler_Valor',
        'link_content': compress_content(dec_content),
        'compressed': True,
        'language': detect_language(dec_content),
        'title': extract_title(news),
        'published': get_published_time(soup),
        'main_text': extract_content(news),
    }

def main():
    """
    Crawl the index pages and store every article not yet in the database.

    Articles whose link is already stored are skipped, and so are articles
    whose download failed.
    """
    for url in find_articles():
        logger.info("Found article: %s", url)
        # find_one stops at the first match instead of loading every match.
        if ARTICLES.find_one({"link": url}) is not None:
            logger.info("Article already exists: %s", url)
            continue

        article = download_article(url)
        if article is None:
            # download_article returns None on fetch errors; inserting None
            # would make the whole run crash.
            continue
        logger.info("Download done: %s", url)
        ARTICLES.insert(article, w=1)
        logger.info("Saved: %s", url)


if __name__ == '__main__':
    main()























Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also remove these trailing empty lines.

32 changes: 32 additions & 0 deletions capture/logging_mc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging
from logging.handlers import RotatingFileHandler

def get_logger(source):
    """
    Build the logger responsible for saving the logs of a source's operations.

    Messages go both to the console and to a rotating file at
    /tmp/mediacloud_<source>.log. Calling this again for the same source
    returns the already configured logger without attaching new handlers.

    :param source: name of the source; used as logger name and in the file name
    :return: logger configured based on source
    :rtype: logging.Logger
    """
    logger = logging.getLogger(source)
    logger.setLevel(logging.DEBUG)

    if logger.handlers:
        # logging.getLogger returns the same object for the same name; adding
        # the handlers again would emit every message more than once.
        return logger

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # console output
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)

    # file output, rotated at about 5 MB with 3 backups kept
    file_handler = RotatingFileHandler(
        '/tmp/mediacloud_{0}.log'.format(source),
        maxBytes=5000000, backupCount=3)
    file_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    return logger