From bd7181066f51719402d4ab09df3839d141c6b3fb Mon Sep 17 00:00:00 2001
From: alickrxu <alickrxu@gmail.com>
Date: Wed, 20 Apr 2016 15:06:10 -0700
Subject: [PATCH] updated quantcast scraper

---
 src/qcast.py | 57 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/src/qcast.py b/src/qcast.py
index 24b3260..5821431 100644
--- a/src/qcast.py
+++ b/src/qcast.py
@@ -14,28 +14,47 @@
 import sys
 import time
 
-from BeautifulSoup import BeautifulSoup as bs
+from bs4 import BeautifulSoup as bs
 
 
 def read_file(filename):
     return [l.strip() for l in open(filename, 'rb').readlines()]
 
+def get_demographics():
+    country = 'EG' #specify country you want top sites from
+    url = 'https://www.quantcast.com/'
+    classes = ['tr-GENDER', 'tr-AGE', 'tr-CHILDREN', 'tr-INCOME', 'tr-EDUCATION', 'tr-ETHNICITY', 'tr-POLITICS']
+    brands = read_file(sys.argv[1])
+    s = requests.Session()
 
-url = 'https://www.quantcast.com/'
-classes = ['tr-GENDER', 'tr-AGE', 'tr-CHILDREN', 'tr-INCOME', 'tr-EDUCATION', 'tr-ETHNICITY', 'tr-POLITICS']
-brands = read_file(sys.argv[1])
-for brand in brands:
-    u = url + brand + '/demographics'
-    data = {}
-    data['brand'] = brand
-    try:
-        soup = bs(requests.get(u).text)
-        for cl in soup.findAll(attrs={'class': 'demographics-composition'}):
-            for tr in cl.findAll('tr'):
-                label = tr.findChild(attrs={'style': 'text-align:left; padding-right:2px; width:108px'}).contents[0].strip()
-                value = tr.findChild(attrs={'class': re.compile(r"index-digit.*")}).contents[0].strip()
-                data[label] = value
-        print json.dumps(data, sort_keys=True)
-        time.sleep(1)  # Be nice and sleep 1 second between calls.
-    except Exception as e:
-        sys.stderr.write('exception %s, skipping\n' % e)
+    # These cookies are needed to access the old Quantcast page, which we can scrape data from.
+    oldQuant = {
+        'viewNewProfile' : '0', 
+        'expires' : 'Fri, 1 Jul 2016 00:00:00 GMT', 
+        'path' : '/'}
+
+    demographics = []
+
+    for brand in brands:
+        u = url + brand + '/demographics' + '?country=' + country
+        data = {}
+        data['brand'] = brand
+        try:
+            soup = bs(s.get(u, cookies=oldQuant).text, 'html.parser')
+            #print(soup.prettify())
+            for cl in soup.findAll(attrs={'class': 'demographics-composition'}):
+                for tr in cl.findAll('tr'):
+                    label = tr.findChild(attrs={'style': 'text-align:left; padding-right:2px; width:108px'}).contents[0].strip()
+                    value = tr.findChild(attrs={'class': re.compile(r"index-digit.*")}).contents[0].strip()
+                    data[label] = value
+            print json.dumps(data, sort_keys=True)
+            demographics.append(json.dumps(data, sort_keys=True))
+            time.sleep(1)  # Be nice and sleep 1 second between calls.
+        except Exception as e:
+            sys.stderr.write('exception %s, skipping\n' % e)
+
+    with open("demographics.txt", "w") as myfile:
+        for demo in demographics:
+            myfile.write(demo + '\n')
+
+get_demographics()
\ No newline at end of file