-
Notifications
You must be signed in to change notification settings - Fork 0
/
scan.py
74 lines (61 loc) · 1.89 KB
/
scan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import logging
import os
import re

import boto3
from better_profanity import profanity
from dotenv import load_dotenv
# Load settings via python-dotenv before the environment is read below.
load_dotenv()

# S3 bucket that scanned objects are looked up in (used as the Textract
# ``S3Object.Bucket``); None when AWS_BUCKET_NAME is not set.
bucket = os.environ.get("AWS_BUCKET_NAME")

# AWS region handed to the Textract client; None when AWS_REGION is not set.
region = os.environ.get("AWS_REGION")

# Single Textract client created at import time and shared by every scan call.
client = boto3.client("textract", region_name=region)
# Matches ANSI SGR ("colour") escape sequences: ESC [ <params> m, where
# <params> is any run of digits and semicolons (e.g. ESC[94m, ESC[0m,
# ESC[1;94m, ESC[m). Compiled once so the loop does not re-resolve it per item.
_ANSI_SGR_RE = re.compile(r'\x1b\[[\d;]*m')


def clean_text(text, return_type):
    """Strip ANSI colour escape sequences from each string in *text*.

    Args:
        text: Iterable of strings, each possibly containing ANSI SGR
            escape sequences.
        return_type: ``"list"`` to get the cleaned strings back as a list;
            any other value joins them with single spaces into one string.

    Returns:
        A list of cleaned strings when ``return_type == "list"``, otherwise
        one space-joined string (empty string for empty input).
    """
    # One substitution pass removes every sequence; the original applied the
    # same substitution twice and missed multi-parameter codes like ESC[1;94m.
    cleaned = [_ANSI_SGR_RE.sub('', item) for item in text]
    if return_type == "list":
        return cleaned
    return ' '.join(cleaned)
def scan_web_url_image(url, return_text_type):
    """Run Textract text detection on an S3 object and return its text lines.

    Args:
        url: Passed as the ``Name`` of the S3 object inside the module-level
            ``bucket``.
        return_text_type: ``"clean"`` asks ``clean_text`` for its ``"string"``
            form; any other value asks for its ``"list"`` form.

    Returns:
        Whatever ``clean_text`` returns for the collected lines.

    The collected lines (wrapped in ANSI colour codes) are printed to stdout
    before being cleaned.
    """
    ocr_result = client.detect_document_text(
        Document={'S3Object': {'Bucket': bucket, 'Name': url}}
    )

    # Keep only LINE blocks, each wrapped in blue/reset colour codes.
    highlighted = [
        '\033[94m' + block["Text"] + '\033[0m'
        for block in ocr_result["Blocks"]
        if block["BlockType"] == "LINE"
    ]

    print(highlighted)
    output_kind = "string" if return_text_type == "clean" else "list"
    return clean_text(highlighted, output_kind)
def find_matching_text(college_name, name, ht_number, s3_image_url):
    """Report whether the image's OCR text contains all three expected values.

    The image is scanned via ``scan_web_url_image`` and the lowercased result
    is searched for each expected value as a case-insensitive substring.

    Args:
        college_name: College name expected in the image text.
        name: Person name expected in the image text.
        ht_number: Identifier string expected in the image text.
        s3_image_url: Forwarded unchanged to ``scan_web_url_image``.

    Returns:
        True only if all three values occur in the scanned text. False if any
        is missing, or if scanning/matching raised (best-effort: failures are
        logged and reported as "no match" rather than propagated).
    """
    try:
        result = scan_web_url_image(s3_image_url, "clean").lower()
        print(result)
        # Checked in order with short-circuiting, so a later value is not
        # touched once an earlier one is missing.
        expected_values = (college_name, name, ht_number)
        return all(value.lower() in result for value in expected_values)
    except Exception:
        # Deliberate catch-all at this boundary: callers only get a bool, so
        # record the failure instead of discarding it silently.
        logging.getLogger(__name__).exception(
            "Text matching failed for %r", s3_image_url
        )
        return False
def check_image_for_profanity(s3_image_url):
    """Report whether any text line detected in the image contains profanity.

    Args:
        s3_image_url: Forwarded unchanged to ``scan_web_url_image``.

    Returns:
        True if ``profanity.contains_profanity`` flags at least one lowercased
        line. False if none is flagged, or if scanning/checking raised
        (failures are logged and reported as False rather than propagated).
    """
    try:
        lines = scan_web_url_image(s3_image_url, "raw")
        # Stops at the first offending line.
        return any(profanity.contains_profanity(line.lower()) for line in lines)
    except Exception:
        # NOTE(review): returning False here means an image that could not be
        # scanned is treated the same as a clean one - confirm callers want
        # this fail-open behaviour. Kept to preserve the existing contract.
        logging.getLogger(__name__).exception(
            "Profanity check failed for %r", s3_image_url
        )
        return False