-
Notifications
You must be signed in to change notification settings - Fork 3
/
grab_urls.php
106 lines (90 loc) · 3.65 KB
/
grab_urls.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
<?php
/*
* grab_urls.php
*
* Copyright 2012 caprenter <[email protected]>
*
* This file is part of IATI Registry Refresher.
*
* IATI Registry Refresher is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* IATI Registry Refresher is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with IATI Registry Refresher. If not, see <http://www.gnu.org/licenses/>.
*
* IATI Registry Refresher relies on other free software products. See the README.txt file
* for more details.
*/
// Demo/debugging aid: print errors to standard output and report every error level.
// The @ operator silences any warning raised by ini_set() itself.
@ini_set('display_errors', 'stdout');
@ini_set('error_reporting', E_ALL);
/**
 * Perform a request against the IATI Registry CKAN v3 API.
 *
 * The request is sent as a POST with a JSON body. An attempt that fails at the
 * transport level or returns a non-200 status is retried, up to 5 attempts in
 * total, pausing one second between attempts.
 *
 * @param string      $path      API path relative to the API root,
 *                               e.g. 'action/organization_list'.
 * @param mixed       $data      Request parameters, JSON-encoded into the body;
 *                               null sends an empty JSON object.
 * @param string|null $ckan_file If given, the raw body of a successful response
 *                               is saved to this path. Nothing is written when
 *                               the request fails, so an existing file is kept.
 *
 * @return mixed The "result" member of the decoded response, or null when the
 *               request failed or the response was not a JSON object (a
 *               warning describing the failure is raised in that case).
 */
function api_request($path, $data=null, $ckan_file=null) {
    $api_root = "https://iatiregistry.org/api/3/";
    $max_attempts = 5;

    $data_string = ($data === null) ? '{}' : json_encode($data);
    if ($data_string === false) {
        trigger_error(sprintf('Could not JSON-encode the parameters for API request "%s"', $path), E_USER_WARNING);
        return null;
    }

    $ch = curl_init($api_root.$path);
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data_string);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, "IATI Secretariat Registry Refresher https://github.com/IATI/IATI-Registry-Refresher");
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Content-Type: application/json',
        'Content-Length: '.strlen($data_string))
    );

    $result = false;
    $http_code = 0;
    for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
        $result = curl_exec($ch);
        $http_code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($result !== false && $http_code === 200) {
            break;
        }
        // Wait a second before retrying, but not after the final attempt.
        if ($attempt < $max_attempts) {
            sleep(1);
        }
    }
    // Read the error text before the handle is closed.
    $curl_error = curl_error($ch);
    curl_close($ch);

    if ($result === false || $http_code !== 200) {
        trigger_error(
            sprintf(
                'API request "%s" failed after %d attempts (HTTP status %d%s)',
                $path,
                $max_attempts,
                $http_code,
                $curl_error !== '' ? ', cURL error: ' . $curl_error : ''
            ),
            E_USER_WARNING
        );
        return null;
    }

    if ($ckan_file !== null) {
        // Save CKAN json from the API call to a file
        file_put_contents($ckan_file, $result, LOCK_EX);
    }

    $decoded = json_decode($result);
    if (!is_object($decoded)) {
        trigger_error(sprintf('API request "%s" did not return a JSON object', $path), E_USER_WARNING);
        return null;
    }
    return isset($decoded->result) ? $decoded->result : null;
}
// Not used below; retained in case a script that includes this file expects it.
$urls = array();

// Pull all the group (publisher) identifiers from the registry.
// They are kept in $groups and processed one at a time below.
$groups = api_request('action/organization_list');

// Override the group array, e.g. for testing. Uncomment and edit the line(s) below
//$groups = array("hewlett-foundation","aa");
//$groups = array("dfid");

// A failed request does not yield a list; report it and process nothing
// rather than letting the loop below choke on a non-array.
if (!is_array($groups)) {
    print 'Could not fetch the list of groups from the registry.' . PHP_EOL;
    $groups = array();
}

// Output goes to "urls/<group>" (one "name url" line per package) and
// "ckan/<group>" (raw API response); create the directories if missing.
foreach (array('urls', 'ckan') as $dir) {
    if (!is_dir($dir) && !mkdir($dir, 0777, true)) {
        print 'Could not create directory '.$dir . PHP_EOL;
    }
}

// Loop through each group and save the URL end-points of its data files
echo "Fetching:" . PHP_EOL;
foreach ($groups as $group) {
    $file = "urls/" . $group;
    echo $group."\n";
    try {
        $result = api_request('action/package_search', array('fq'=>"organization:".$group, 'rows'=>1000000), "ckan/" . $group);

        // Without a package list there is nothing trustworthy to write;
        // skip the group so its existing URL file is not emptied.
        if (!is_object($result) || !isset($result->results) || !is_array($result->results)) {
            print 'No package list returned for '.$group.'; leaving '.$file.' untouched.' . PHP_EOL;
            continue;
        }

        $urls_string = '';
        foreach ($result->results as $package) {
            // A package may have no resources; its line is then written with
            // an empty URL instead of raising undefined-offset warnings.
            $url = '';
            if (isset($package->resources[0]->url)) {
                $url = (string)$package->resources[0]->url;
            }
            $urls_string .= $package->name . ' ' . $url . PHP_EOL;
        }

        if (file_put_contents($file, $urls_string, LOCK_EX) === false) {
            print 'Could not write '.$file . PHP_EOL;
        }
    } catch (Exception $e) {
        // One failing publisher must not stop the remaining ones
        print 'Caught exception in '.$file.': ' . $e->getMessage() . PHP_EOL;
    }
}
?>