forked from andrewferguson/YahooGroups-Archiver
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharchive_group.py
133 lines (113 loc) · 5.01 KB
/
archive_group.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
'''
Yahoo-Groups-Archiver Copyright 2015 Andrew Ferguson
YahooGroups-Archiver, a simple python script that allows for all
messages in a public Yahoo Group to be archived.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
# Yahoo account credentials (placeholder values shipped with the script).
# NOTE(review): no function visible in this file reads either name; requests
# below are made without authentication. Confirm before relying on them.
yahoo_username = "joe-blogs"
yahoo_password = "password"
import json #required for reading various JSON attributes from the content
import urllib2 #required for fetching the raw messages
import os #required for checking if a file exists locally
import time #required if Yahoo blocks access temporarily (to wait)
import sys #required to cancel script if blocked by Yahoo
import shutil #required for deleting an old folder
import glob #required to find the most recent message downloaded
import time #required to log the date and time of run
def _most_recent_archived(groupName):
    """Return the highest message number saved in groupName's folder.

    Looks for files named ``<number>.json`` directly inside the folder.
    Returns 1 when the folder does not exist or holds no such file. Files
    whose name before ``.json`` is not an integer are ignored rather than
    aborting the run.
    """
    mostRecent = 1
    if not os.path.isdir(groupName):
        return mostRecent
    #listdir (instead of chdir + glob) leaves the working directory alone
    for fileName in os.listdir(groupName):
        if not fileName.endswith(".json"):
            continue
        try:
            number = int(fileName[:-5])
        except ValueError:
            #not one of our <number>.json files
            continue
        if number > mostRecent:
            mostRecent = number
    return mostRecent


def archive_group(groupName, mode="update"):
    """Archive every message of a group that is not already on disk.

    Messages are written by archive_message() to
    ``<groupName>/<number>.json``; the run is logged via log().

    groupName -- name of the Yahoo Group; also used as the folder name.
    mode      -- where to start:
                 "update"  start at the highest-numbered message already
                           archived (that one is skipped because its file
                           exists, so work resumes at the next number)
                 "retry"   start at message 1 and attempt every message
                           that has no file yet
                 "restart" delete the existing folder and start at 1
                 Anything else prints the valid modes and calls sys.exit().

    Returns None.
    """
    log("\nArchiving group '" + groupName + "', mode: " + mode + " , on " + time.strftime("%c"), groupName)
    startTime = time.time()
    msgsArchived = 0

    if mode == "retry":
        #don't archive any messages we already have, but try the ones
        #we don't, including ones that failed on an earlier run
        firstMsg = 1
    elif mode == "update":
        firstMsg = _most_recent_archived(groupName)
    elif mode == "restart":
        #delete all previous archival attempts and archive everything again
        if os.path.exists(groupName):
            shutil.rmtree(groupName)
        firstMsg = 1
    else:
        print("You have specified an invalid mode (" + mode + ").")
        print("Valid modes are:\nupdate - add any new messages to the archive\nretry - attempt to get all messages that are not in the archive\nrestart - delete archive and start from scratch")
        sys.exit()

    if not os.path.exists(groupName):
        os.makedirs(groupName)

    lastMsg = group_messages_max(groupName)
    for msgNumber in range(firstMsg, lastMsg + 1):
        if not os.path.isfile(groupName + '/' + str(msgNumber) + ".json"):
            print("Archiving message " + str(msgNumber) + " of " + str(lastMsg))
            if archive_message(groupName, msgNumber):
                msgsArchived = msgsArchived + 1

    log("Archive finished, archived " + str(msgsArchived) + ", time taken is " + str(time.time() - startTime) + " seconds", groupName)
def group_messages_max(groupName):
    """Return ``ygData.totalRecords`` from the group's message listing.

    archive_group() uses the returned value as the highest message number
    to request.

    If the response is not JSON and looks like Yahoo's sign-in page, an
    explanation is printed and the script exits via sys.exit(). Any other
    non-JSON response re-raises the ValueError from json.loads().
    """
    resp = urllib2.urlopen('https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages?count=1&sortOrder=desc&direction=-1')
    try:
        pageHTML = resp.read()
    finally:
        resp.close()

    try:
        pageJson = json.loads(pageHTML)
    except ValueError:
        if "Sign in to your account" in pageHTML and "Keep me signed in" in pageHTML:
            #the user needs to be signed in to Yahoo
            print("Error. The group you are trying to archive is a private group. Only public groups (groups who's messages can be viewed by non-members) can be archived at this time (hopefully will change soon...)")
            sys.exit()
        #not JSON and not the sign-in page: surface the parse error instead
        #of falling through to a return that uses an unbound pageJson
        raise
    return pageJson["ygData"]["totalRecords"]
def archive_message(groupName, msgNumber, depth=0):
    """Download one raw message and save it as ``<groupName>/<msgNumber>.json``.

    groupName -- group to fetch from; also the folder the file is written to.
    msgNumber -- number of the message to fetch.
    depth     -- how many failed attempts to count as already made; with the
                 default of 0 the request is tried once and then retried up
                 to 3 more times after an urllib2.HTTPError.

    Returns True when the message was written, False when every attempt
    failed. If the last failure reads "HTTP Error 500: Server Error" the
    script assumes Yahoo has blocked it, logs that and calls sys.exit().
    """
    #the module-level flag is still updated for backward compatibility,
    #but nothing in this function depends on it any more
    global failed
    failed = False

    url = 'https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages/' + str(msgNumber) + '/raw'
    attempt = depth
    resp = None
    #retry in a loop so a successful retry yields a usable response here,
    #rather than being lost inside a recursive call
    while resp is None:
        try:
            resp = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            #some other problem, perhaps being refused access by Yahoo?
            #retry for a max of 3 times anyway
            if attempt < 3:
                print("Cannot get message " + str(msgNumber) + ", attempt " + str(attempt+1) + " of 3")
                time.sleep(0.1)
                attempt = attempt + 1
                continue
            if str(e) == "HTTP Error 500: Server Error":
                #we are most likely being blocked by Yahoo
                log("Archive halted - it appears Yahoo has blocked you.", groupName)
                log("Check if you can access the group's homepage from your browser. If you can't, you have been blocked.", groupName)
                log("Don't worry, in a few hours (normally less than 3) you'll be unblocked and you can run this script again - it'll continue where you left off." ,groupName)
                sys.exit()
            log("Failed to retrive message " + str(msgNumber), groupName )
            failed = True
            return False

    try:
        msgJson = resp.read()
    finally:
        resp.close()

    with open((groupName + "/" + str(msgNumber) + ".json"), "wb") as writeFile:
        writeFile.write(msgJson)
    return True
def log(msg, groupName):
    """Print msg and append it to ``<groupName>.txt``.

    Each entry is written as a newline followed by msg, so the log file
    starts with an empty line and has no trailing newline. The file is
    closed before returning.
    """
    print(msg)
    with open(groupName + ".txt", "a") as logF:
        logF.write("\n" + msg)
if __name__ == "__main__":
    #usage: archive_group.py <groupName> [update|retry|restart]
    if len(sys.argv) < 2:
        #without this check a missing group name surfaces as an IndexError
        print("Usage: " + sys.argv[0] + " <groupName> [update|retry|restart]")
        sys.exit(1)
    #work from the script's own folder so archives and logs land beside it
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    if len(sys.argv) > 2:
        archive_group(sys.argv[1], sys.argv[2])
    else:
        archive_group(sys.argv[1])