Skip to content

Commit

Permalink
Update __init__.py
Browse files Browse the repository at this point in the history
Solved Unicode errors for some charsets such as gb2312.
Added the getTitle and getDate methods and removed getImages.
Added a code path that loads the source from a local file.
  • Loading branch information
Caimany committed Jun 1, 2015
1 parent 48aae11 commit 149bd1d
Showing 1 changed file with 40 additions and 24 deletions.
64 changes: 40 additions & 24 deletions src/boilerpipe/extract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import socket
import charade
import threading
import re

socket.setdefaulttimeout(15)
lock = threading.Lock()
Expand All @@ -18,7 +19,8 @@ class Extractor(object):
being one of the boilerpipe extractors:
- DefaultExtractor
- ArticleExtractor
- ArticleSentencesExtractor
- ArticleSentencesExtractor file='/home/mj/t20150528_653893.htm'
- KeepEverythingExtractor
- KeepEverythingWithMinKWordsExtractor
- LargestContentExtractor
Expand All @@ -29,7 +31,7 @@ class Extractor(object):
source = None
data = None
headers = {'User-Agent': 'Mozilla/5.0'}

def __init__(self, extractor='DefaultExtractor', **kwargs):
if kwargs.get('url'):
request = urllib2.Request(kwargs['url'], headers=self.headers)
Expand All @@ -38,11 +40,34 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
encoding = connection.headers['content-type'].lower().split('charset=')[-1]
if encoding.lower() == 'text/html':
encoding = charade.detect(self.data)['encoding']
self.data = unicode(self.data, encoding)
# self.data = unicode(self.data, 'gbk')
#self.data = self.data.decode(encoding, 'ignore')
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')
elif kwargs.get('html'):
self.data = kwargs['html']
if not isinstance(self.data, unicode):
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
# self.data = unicode(self.data, charade.detect(self.data)['encoding'])
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')
## Extractor(extractor='ArticleExtractor',file='/tmp/a.html')
elif kwargs.get('file'):
Path = kwargs['file']
f = open(Path, 'r')
self.data = f.read()
if not isinstance(self.data, unicode):
try:
self.data = unicode(self.data, charade.detect(self.data)['encoding'])
except UnicodeError:
encoding = charade.detect(self.data)['encoding']
self.data = self.data.decode(encoding, 'ignore')

else:
raise Exception('No text or url provided')

Expand All @@ -52,35 +77,26 @@ def __init__(self, extractor='DefaultExtractor', **kwargs):
if jpype.isThreadAttachedToJVM() == False:
jpype.attachThreadToJVM()
lock.acquire()

self.extractor = jpype.JClass(
"de.l3s.boilerpipe.extractors."+extractor).INSTANCE
finally:
lock.release()

reader = StringReader(self.data)
self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
self.extractor.process(self.source)

def getText(self):
    """Return the content of the processed document held in ``self.source``."""
    document = self.source
    return document.getContent()


def getTitle(self):
    """Return the title reported by the processed document in ``self.source``."""
    document = self.source
    return document.getTitle()

def getHTML(self):
    """Run an extracting ``HTMLHighlighter`` over the document and return its output.

    The highlighter is given both the processed document (``self.source``)
    and the original markup (``self.data``).
    """
    return HTMLHighlighter.newExtractingInstance().process(self.source, self.data)

def getImages(self):
    """Return a list of dicts describing the images found by ImageExtractor.

    The Java ``ImageExtractor`` singleton processes the document together
    with the original markup; the resulting collection is sorted in place
    with ``java.util.Collections.sort`` before being converted.  Each dict
    carries the keys ``src``, ``width``, ``height``, ``alt`` and ``area``.
    """
    image_extractor = jpype.JClass("de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
    found = image_extractor.process(self.source, self.data)
    jpype.java.util.Collections.sort(found)

    described = []
    for img in found:
        # Getter order matches the key order of the resulting dict.
        described.append({
            'src': img.getSrc(),
            'width': img.getWidth(),
            'height': img.getHeight(),
            'alt': img.getAlt(),
            'area': img.getArea(),
        })
    return described

def getDate(self):
    """Return the first ``YYYY-MM-DD HH:MM:SS`` timestamp found in ``self.data``.

    Returns:
        The matched timestamp text, or ``None`` when ``self.data`` contains
        no timestamp in that format.
    """
    # Raw string literals keep the pattern safe from escape processing.
    # The year alternation covers 1970-2099; the previous ``20[0-1][0-9]``
    # branch silently ignored every date from 2020 onward.
    pattern = (
        r'(19[7-9][0-9]|20[0-9][0-9])-(0[1-9]|1[0-2])-([1-2][0-9]|0[1-9]|3[0-1])'
        r' ([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9])'
    )
    match = re.search(pattern, self.data)
    # re.search yields None when nothing matches; calling .group() on that
    # result used to raise AttributeError for pages without a timestamp.
    return match.group() if match else None

1 comment on commit 149bd1d

@Caimany
Copy link
Author

@Caimany Caimany commented on 149bd1d Jun 1, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have removed the getImages() method because I didn't find any relevant source for ImageExtractor in boilerpipe-1.2.

Please sign in to comment.