Merge pull request #140 from osamhack2021/develop
Develop
dev-taewon-kim authored Oct 11, 2021
2 parents ca8d4d2 + 49b041b commit df63ff7
Showing 71 changed files with 3,271 additions and 1,677 deletions.
10 changes: 5 additions & 5 deletions WEB/backend/README.md
@@ -33,15 +33,15 @@ sudo docker-compose -version

## Usage

#### Analyzer
1. Move to ```~/WEB/NLP/``` and run command ```docker-compose up```
2. Move to ```~/WEB/backend/``` and run command ```chmod a+x analyzer.sh```
3. Run command ```./analyzer.sh```

#### Django
1. Move to ```~/WEB/backend/``` and run command ```cp web-docker-env-example web-docker-env```
2. Edit ```web-docker-env``` with your own credentials.
3. Move to ```~/WEB/backend/drf/``` and run command ```cp secrets.example.json secrets.json```
4. Edit ```secrets.json``` with your own credentials.
5. Move to ```~/WEB/backend/``` and run command ```chmod a+x web.sh```
6. Run command ```./web.sh```

#### Analyzer
1. Move to ```~/WEB/NLP/``` and run command ```docker-compose up```
2. Move to ```~/WEB/backend/``` and run command ```chmod a+x analyzer.sh```
3. Run command ```./analyzer.sh```
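Before step 3 (`./analyzer.sh`), it can be worth confirming that the MongoDB instance the analyzer writes to is actually reachable. The sketch below is a minimal pre-flight check and not part of the repository: the port and the `riskout.analyzed` collection are taken from `DBHandler` in `analyzer.py` (shown further down in this commit), and `localhost:8001` is assumed to be the host-side mapping, mirroring the commented-out `localhost` line in that class.

```python
# Minimal pre-flight check (not in the repo): verify the Mongo instance that
# analyzer.py's DBHandler targets is reachable before launching ./analyzer.sh.
from pymongo import MongoClient
from pymongo.errors import PyMongoError

HOST = "localhost"   # "host.docker.internal" when run from inside a container
PORT = 8001          # same port DBHandler uses

client = MongoClient(HOST, PORT, serverSelectionTimeoutMS=3000)
try:
    client.admin.command("ping")                           # raises if unreachable
    count = client["riskout"]["analyzed"].count_documents({})
    print(f"Mongo is up; riskout.analyzed holds {count} documents")
except PyMongoError as e:
    print(f"Mongo not reachable on {HOST}:{PORT}: {e}")
finally:
    client.close()
```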
24 changes: 13 additions & 11 deletions WEB/backend/analyzer-docker-compose.yml
@@ -12,11 +12,12 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
entrypoint: ["/bin/bash","-c"]
command:
- |
pip install --no-cache-dir -r /analyzer/requirements.txt
python -u /crawler/main.py
python -u /analyzer/analyzer.py
# dev only
# command:
# - |
# pip install --no-cache-dir -r /analyzer/requirements.txt
# python -u /crawler/main.py
# python -u /analyzer/analyzer.py
depends_on:
- mongo
- mongo-seed
@@ -37,9 +38,10 @@ services:
depends_on:
- mongo

analyzer-restarter:
container_name: analyzer-restarter_service
image: docker
volumes: ["/var/run/docker.sock:/var/run/docker.sock"]
command: ["/bin/sh", "-c", "while true; do sleep 3600; docker restart analyzer_service; done"]
restart: unless-stopped
# dev only
# analyzer-restarter:
# container_name: analyzer-restarter_service
# image: docker
# volumes: ["/var/run/docker.sock:/var/run/docker.sock"]
# command: ["/bin/sh", "-c", "while true; do sleep 3600; docker restart analyzer_service; done"]
# restart: unless-stopped
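For reference, the `analyzer-restarter` service that this commit comments out only does one thing: restart `analyzer_service` once an hour through the Docker socket. The sketch below is a rough Python equivalent of that loop, for illustration only; it assumes the third-party `docker` SDK (`pip install docker`) and access to `/var/run/docker.sock`, neither of which this compose file actually uses.

```python
# Rough, illustrative equivalent of the commented-out analyzer-restarter loop:
# restart the analyzer container once an hour via the Docker socket.
# Assumes the docker SDK (pip install docker); the container name comes from
# analyzer-docker-compose.yml.
import time

import docker
from docker.errors import NotFound

client = docker.from_env()  # talks to /var/run/docker.sock by default

while True:
    time.sleep(3600)  # mirrors: while true; do sleep 3600; docker restart analyzer_service; done
    try:
        client.containers.get("analyzer_service").restart()
        print("restarted analyzer_service")
    except NotFound:
        print("analyzer_service not found; will retry in an hour")
```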
113 changes: 70 additions & 43 deletions WEB/backend/analyzer/analyzer.py
@@ -29,16 +29,16 @@ def __init__(self, extracted):
self.content_dict = extracted
"""
id: (generated right before insert into mongoDB)
created_at: (generated right before insert into mongoDB)
site_url, thumbnail_url, category, title, contentBody, author: taken from extracted
site_url, thumbnail_url, category, title, contentBody, author, created_at: taken from extracted
summarized, positivity, entities: assigned after the analyze calls
"""

self.getSummarized()
self.getPositivity()
self.getEntities()
self.getTrueScore()
self.content_dict['isAnalyzed'] = True


Expand Down Expand Up @@ -113,10 +113,39 @@ def getEntities(self):
except Exception as e:
print(f"Error occured while fetching entities : {e}")
self.content_dict['entities'] = None


def getTrueScore(self):
if self.content_dict['category'] == 'news':
url = SERVER_URL + 'fakenews'
document = {"document": self.content_dict['contentBody']}
document = json.dumps(document)

try:
true_score = requests.post(url, data=document, timeout=20)

if true_score.status_code == 200:
try:
self.content_dict['true_score'] = json.loads(true_score.text)['true_score']
except Exception as e:
print(f"Error occured while true_score data : {e}")
self.content_dict['true_score'] = None

else:
print(f"Error occured while fetching true_score data : {true_score.status_code}")
self.content_dict['true_score'] = None

except Exception as e:
print(f"Error occured while fetching true_score data : {e}")
self.content_dict['true_score'] = None

else:
self.content_dict['true_score'] = None


class DBHandler:
def __init__(self):
# host = "localhost"
host = "host.docker.internal"
port = "8001"
self.client = MongoClient(host, int(port))
@@ -216,59 +245,56 @@ def extractor(data):
conn.close()
quit()

contents = []

for idx, tup in enumerate(data):
extracted = {}
extracted['title'] = tup[0]
extracted['site_url'] = tup[1]
extracted['thumbnail_url'] = tup[2]
extracted['contentBody'] = unicodedata.normalize('NFKC', tup[3]) # fix for whitespace characters being read as \xa0
extracted['category'] = tup[4]
extracted['created_at'] = datetime.strptime(tup[9].strip(), "%y_%m_%d")
extracted['author'] = tup[10]

content = Content(extracted)
contents.append(content.content_dict)

cur.execute("UPDATE CrawlContents SET isAnalyzed = 1 WHERE id = ?", (tup[7], ))
conn.commit()

print(f"[+] Extractor: {idx + 1}/{len(data)}")
if dbInserter(content.content_dict):
print(f"[+] Extractor: {idx + 1}/{len(data)}")

return contents
return None


def dbInserter(contents):
validated_contents= []
def dbInserter(content):
mongo = DBHandler()
hasNone = False

for i in range(len(contents)):
hasNone = False
for key in content:
if key in ['title', 'site_url', 'thumbnail_url', 'summarized', 'true_score']:
if content['category'] == 'news' and content[key] is None:
hasNone = True
break

for key in contents[i]:
if key in ['summarized', 'title', 'site_url', 'thumbnail_url']:
if contents[i]['category'] == 'news' and contents[i][key] == None:
hasNone = True
break

else:
if contents[i][key] is None:
hasNone = True
break
else:
if content[key] is None:
hasNone = True
break

if not hasNone:
contents[i]['_id'] = mongo.get_next_sequence('analyzed_counter', 'riskout', 'counter')
contents[i]['created_at'] = (datetime.utcnow() + timedelta(hours=9))
validated_contents.append(contents[i])
if not hasNone:
content['_id'] = mongo.get_next_sequence('analyzed_counter', 'riskout', 'counter')

try:
mongo.insert_item_many(validated_contents, "riskout", "analyzed")
print('DB insertion success')
mongo.client.close()
return True
try:
mongo.insert_item_one(content, "riskout", "analyzed")
mongo.client.close()
return True

except Exception as e:
print("DB insert error occured :", e)
mongo.client.close()
return False

except Exception as e:
print("DB insert error occured :", e)
else:
print("DB insert error occured : null found!")
mongo.client.close()
return False

@@ -285,22 +311,23 @@ def main():
for tup in raw_data:
if tup[9] not in date_list:
date_list.append(tup[9])

today = (datetime.utcnow() + timedelta(hours=9)).strftime('%y_%m_%d')

for date in date_list:
cur.execute("SELECT * FROM CrawlContents WHERE isAnalyzed = 0 AND created_at = ?", (date,))
if date != today:
important_data_list.extend(dataRanker(cur.fetchall()))
cur.execute("UPDATE CrawlContents SET isAnalyzed = 1 WHERE isAnalyzed = 0 AND created_at = ?", (date,))
cur.execute("SELECT * FROM CrawlContents WHERE isAnalyzed = 0 AND category = 'news' AND created_at = ?", (date,))
ranked_list = dataRanker(cur.fetchall())

if ranked_list:
important_data_list.extend(ranked_list)
cur.execute("UPDATE CrawlContents SET isAnalyzed = 1 WHERE isAnalyzed = 0 AND category = 'news' AND created_at = ?", (date,))
conn.commit()
else:
important_data_list.extend(dataRanker(cur.fetchall()))

cur.execute("SELECT * FROM CrawlContents WHERE isAnalyzed = 0") # news๋Š” ์ด๋ฏธ analyzed ๋˜์—ˆ๊ธฐ ๋•Œ๋ฌธ์— sns์™€ community๋งŒ ๋‚จ๋Š”๋‹ค
important_data_list.extend(cur.fetchall())


print(f"[*] Serving {len(important_data_list)} pages to Extractor...")

contents = extractor(important_data_list)
dbInserter(contents)
extractor(important_data_list)

cur.close()
conn.close()
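The new `getTrueScore` method above posts the article body to the NLP service's `fakenews` route and expects a JSON response containing a `true_score` field. The snippet below is a standalone sketch of that request/response contract for manual testing; the `SERVER_URL` value is an assumption (analyzer.py defines its own), so point it at the NLP stack started from `~/WEB/NLP/`.

```python
# Standalone sketch of the request/response contract used by getTrueScore
# (manual testing only). SERVER_URL is an assumption; analyzer.py defines
# its own value for the NLP stack started from ~/WEB/NLP/.
import json

import requests

SERVER_URL = "http://localhost:8000/"  # assumed base URL; adjust to your deployment

payload = json.dumps({"document": "Sample news article body to score."})

resp = requests.post(SERVER_URL + "fakenews", data=payload, timeout=20)
resp.raise_for_status()

true_score = resp.json().get("true_score")
print(f"true_score = {true_score}")  # getTrueScore stores this on the content dict
```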
120 changes: 65 additions & 55 deletions WEB/backend/crawler/crawler/crawler.py
@@ -17,8 +17,10 @@
import crawler.db as database

# error
from crawler.error import HTMLElementsNotFoundError as notfound_error, contentLengthError
from crawler.error import HTMLElementsNotFoundError as notfound_error
from crawler.error import contentLengthError
from crawler.error import englishContentError
from crawler.error import daterangeError

# import setting values
from crawler.setting import DEBUG
@@ -106,6 +108,8 @@ async def get_contents(site, contents_url, urlinfo, db):
if news_content.contents_id not in db.select_id():
db.put_content(news_content)
# print(db.select_id())
except daterangeError:
raise daterangeError
except Exception as detail:
if(DEBUG):
print("an exception occured when getting information of contentsPage")
@@ -132,68 +136,74 @@ async def crawl(site):

test_breaker = 0

while prev_page != now_page: # and test_breaker < const.MAX_LISTPAGE_CRAWL:
if(DEBUG):
print('\nlisturl: ' + urlbase + str(now_page) + '\n')

try:
response = get_request(urlbase + str(now_page), site.header)
except requests.exceptions.ConnectionError as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: failed connection by following exception")
print(detail)
break
except requests.exceptions.Timeout as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: server timeout occured")
print(detail)
break
except requests.exceptions.HTTPError as detail:
try:
while prev_page != now_page: # and test_breaker < const.MAX_LISTPAGE_CRAWL:
if(DEBUG):
print("in crawler/crawler.py/crawl: unsuccessful respond occured")
print(detail)
break
except requests.exceptions.RequestException as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: any other exception occured on getting respond")
print(detail)
break
print('\nlisturl: ' + urlbase + str(now_page) + '\n')

try:
response = get_request(urlbase + str(now_page), site.header)
except requests.exceptions.ConnectionError as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: failed connection by following exception")
print(detail)
break
except requests.exceptions.Timeout as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: server timeout occured")
print(detail)
break
except requests.exceptions.HTTPError as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: unsuccessful respond occured")
print(detail)
break
except requests.exceptions.RequestException as detail:
if(DEBUG):
print("in crawler/crawler.py/crawl: any other exception occured on getting respond")
print(detail)
break

list_html = response.text
list_soup = bs(list_html, 'html.parser')
list_html = response.text
list_soup = bs(list_html, 'html.parser')

try:
now_page = site.listpage.get_nowpage(list_soup)
except notfound_error as detail:
if(DEBUG):
print("in crawler/crawler.py: now_page not found by following exception")
print(detail)
break

# not ideal, but leaving it this way for now
if(now_page == prev_page):
break

try:
contents_urls= site.listpage.get_contents_urls(list_soup)
except notfound_error as detail:
if(DEBUG):
print("in crawler/crawler.py: can't found contents url by following exception")
print(detail)
break
try:
now_page = site.listpage.get_nowpage(list_soup)
except notfound_error as detail:
if(DEBUG):
print("in crawler/crawler.py: now_page not found by following exception")
print(detail)
break

# not ideal, but leaving it this way for now
if(now_page == prev_page):
break
try:
contents_urls= site.listpage.get_contents_urls(list_soup)
except notfound_error as detail:
if(DEBUG):
print("in crawler/crawler.py: can't found contents url by following exception")
print(detail)
break

futures = [asyncio.ensure_future(get_contents(site, contents_url, urlinfo, db)) for contents_url in contents_urls]
futures = [asyncio.ensure_future(get_contents(site, contents_url, urlinfo, db)) for contents_url in contents_urls]

await asyncio.gather(*futures)
await asyncio.gather(*futures)

if(DEBUG):
print("nowpage: " + str(now_page) + '\n')
if(DEBUG):
print("nowpage: " + str(now_page) + '\n')

await asyncio.sleep(const.CRAWLING_LIST_INTERVAL)
await asyncio.sleep(const.CRAWLING_LIST_INTERVAL)

test_breaker += 1
prev_page = now_page
now_page += 1
test_breaker += 1
prev_page = now_page
now_page += 1
except daterangeError as detail:
if(DEBUG):
print("crawling over date by following exception")
print(detail)
break

# db.select_all()
db.close()
7 changes: 7 additions & 0 deletions WEB/backend/crawler/crawler/error.py
@@ -18,3 +18,10 @@ def __init__(self):

def __str__(self):
return self.msg

class daterangeError(Exception):
def __init__(self):
self.msg = "date over"

def __str__(self):
return self.msg
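Taken together, the new `daterangeError` and the `try/except` wrapped around the crawl loop let a single content page that falls outside the crawl window stop the whole listing loop: `get_contents` re-raises the error, `asyncio.gather` propagates it, and `crawl` catches it and exits. The sketch below is a minimal, self-contained illustration of that control flow using stand-in coroutines, not the real crawler code.

```python
# Minimal illustration of the stop-on-date-range control flow (stand-in
# coroutines, not the real crawler): an exception raised inside one
# get_contents task propagates through asyncio.gather and ends the loop.
import asyncio


class daterangeError(Exception):  # mirrors crawler/error.py
    def __init__(self):
        self.msg = "date over"

    def __str__(self):
        return self.msg


async def get_contents(day):
    if day > 3:  # pretend anything past day 3 is outside the crawl window
        raise daterangeError()
    print(f"crawled contents for day {day}")


async def crawl():
    page = 1
    try:
        while True:
            futures = [asyncio.ensure_future(get_contents(d)) for d in (page, page + 1)]
            await asyncio.gather(*futures)  # re-raises daterangeError from any task
            page += 2
    except daterangeError as detail:
        print("crawling over date by following exception")
        print(detail)


asyncio.run(crawl())
```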