diff --git a/iati_datastore/iatilib/crawler.py b/iati_datastore/iatilib/crawler.py index e60a8378..91c6f2aa 100644 --- a/iati_datastore/iatilib/crawler.py +++ b/iati_datastore/iatilib/crawler.py @@ -157,8 +157,8 @@ def hash(string): return m.digest() -def parse_activity(new_identifiers, old_xml, resource): - for activity in parse.document_from_bytes(resource.document, resource): +def parse_activity(new_identifiers, old_xml, resource, ignore_hashes=False): + for activity in parse.document_from_bytes(resource.document, resource, ignore_hashes): activity.resource = resource if activity.iati_identifier not in new_identifiers: @@ -184,7 +184,7 @@ def parse_activity(new_identifiers, old_xml, resource): db.session.commit() -def parse_resource(resource): +def parse_resource(resource, ignore_hashes=False): db.session.add(resource) current = Activity.query.filter_by(resource_url=resource.url) current_identifiers = set([i.iati_identifier for i in current.all()]) @@ -197,7 +197,7 @@ def parse_resource(resource): db.session.query(Activity).filter_by(resource_url=resource.url).delete() new_identifiers = set() - parse_activity(new_identifiers, old_xml, resource) + parse_activity(new_identifiers, old_xml, resource, ignore_hashes) resource.version = parse.document_metadata(resource.document) @@ -240,7 +240,7 @@ def update_activities(dataset_name, ignore_hashes=False): dataset = Dataset.query.get(dataset_name) resource = dataset.resources[0] - if ignore_hashes: db.session._update_all_unique = True + #if ignore_hashes: db.session._update_all_unique = True try: db.session.query(Log).filter(sa.and_( @@ -248,7 +248,7 @@ def update_activities(dataset_name, ignore_hashes=False): ['activity_importer', 'failed_activity', 'xml_parser']), Log.resource == dataset_name, )).delete(synchronize_session=False) - parse_resource(resource) + parse_resource(resource, ignore_hashes) db.session.commit() except parse.ParserError as exc: db.session.rollback() @@ -265,7 +265,7 @@ def update_activities(dataset_name, ignore_hashes=False): )) db.session.commit() - if ignore_hashes: db.session._update_all_unique = False + #if ignore_hashes: db.session._update_all_unique = False def update_dataset(dataset_name, ignore_hashes): ''' diff --git a/iati_datastore/iatilib/model.py b/iati_datastore/iatilib/model.py index ba4e5428..bcde9851 100644 --- a/iati_datastore/iatilib/model.py +++ b/iati_datastore/iatilib/model.py @@ -31,15 +31,16 @@ # The "Unique Object" pattern # http://www.sqlalchemy.org/trac/wiki/UsageRecipes/UniqueObject def _unique(session, cls, hashfunc, queryfunc, constructor, arg, kw): + kwargs = {k: kw[k] for k in kw if k != 'ignore_hashes'} cache = getattr(session, '_unique_cache', None) if cache is None: session._unique_cache = cache = {} - key = (cls, hashfunc(*arg, **kw)) + key = (cls, hashfunc(*arg, **kwargs)) if key in cache: - if getattr(session, '_update_all_unique', False): + if kw['ignore_hashes']: obj = cache[key] - for name, value in kw.items(): + for name, value in kwargs.items(): setattr(obj, name, value) return obj else: @@ -47,10 +48,10 @@ def _unique(session, cls, hashfunc, queryfunc, constructor, arg, kw): else: with session.no_autoflush: q = session.query(cls) - q = queryfunc(q, *arg, **kw) + q = queryfunc(q, *arg, **kwargs) obj = q.first() if not obj: - obj = constructor(*arg, **kw) + obj = constructor(*arg, **kwargs) session.add(obj) cache[key] = obj return obj diff --git a/iati_datastore/iatilib/parse.py b/iati_datastore/iatilib/parse.py index 3f10f4d3..7307565f 100644 --- a/iati_datastore/iatilib/parse.py +++ b/iati_datastore/iatilib/parse.py @@ -118,7 +118,7 @@ def xvals_lang(xml, major_version): return ret -def parse_org(xml, resource=no_resource, major_version='1'): +def parse_org(ignore_hashes, xml, resource=no_resource, major_version='1'): data = { "ref": xval(xml, "@ref", u""), "name": xval(xml, TEXT_ELEMENT[major_version], u""), @@ -128,10 +128,10 @@ def parse_org(xml, resource=no_resource, major_version='1'): data['type'] = codelists.by_major_version[major_version].OrganisationType.from_string(xval(xml, "@type")) except (MissingValue, ValueError): data['type'] = None - return Organisation.as_unique(db.session, **data) + return Organisation.as_unique(db.session, **data, ignore_hashes=ignore_hashes) -def reporting_org(element, resource=no_resource, major_version='1'): +def reporting_org(ignore_hashes, element, resource=no_resource, major_version='1'): try: xml = element.xpath("./reporting-org")[0] except IndexError: @@ -157,10 +157,10 @@ def reporting_org(element, resource=no_resource, major_version='1'): exc_info=exe ) - return Organisation.as_unique(db.session, **data) + return Organisation.as_unique(db.session, **data, ignore_hashes=ignore_hashes) -def participating_orgs(xml, resource=None, major_version='1'): +def participating_orgs(ignore_hashes, xml, resource=None, major_version='1'): ret = [] seen = set() for ele in xml.xpath("./participating-org"): @@ -181,7 +181,7 @@ def participating_orgs(xml, resource=None, major_version='1'): role = codelists.by_major_version['1'].OrganisationRole.from_string(value) else: role = codelists.by_major_version[major_version].OrganisationRole.from_string(xval(ele, "@role").title()) - organisation = parse_org(ele, major_version=major_version) + organisation = parse_org(ignore_hashes, ele, major_version=major_version) if not (role, organisation.ref) in seen: seen.add((role, organisation.ref)) ret.append(Participation(role=role, organisation=organisation)) @@ -327,14 +327,14 @@ def description_all_values(xml, resource=None, major_version='1'): return ret -def transactions(xml, resource=no_resource, major_version='1'): +def transactions(ignore_hashes, xml, resource=no_resource, major_version='1'): def from_cl(code, codelist): return codelist.from_string(code) if code is not None else None def from_org(path, ele, resource=None, major_version='1'): organisation = ele.xpath(path) if organisation: - return parse_org(organisation[0], major_version=major_version) + return parse_org(ignore_hashes, organisation[0], major_version=major_version) # return Organisation.as_unique(db.session, ref=org) if org else Nonejk def process(ele): @@ -549,7 +549,7 @@ def from_codelist_with_major_version(codelist_name, path, xml, resource, major_v return from_codelist(getattr(codelists.by_major_version[major_version], codelist_name), path, xml, resource) -def activity(xml, resource=no_resource, major_version='1', version=None): +def activity(xml, resource=no_resource, major_version='1', version=None, ignore_hashes=False): """ Expects xml argument of type lxml.etree._Element """ @@ -585,12 +585,12 @@ def activity(xml, resource=no_resource, major_version='1', version=None): "hierarchy": hierarchy, "last_updated_datetime": last_updated_datetime, "default_language": default_language, - "reporting_org": reporting_org, + "reporting_org": partial(reporting_org, ignore_hashes), "websites": websites, - "participating_orgs": participating_orgs, + "participating_orgs": partial(participating_orgs, ignore_hashes), "recipient_country_percentages": recipient_country_percentages, "recipient_region_percentages": recipient_region_percentages, - "transactions": transactions, + "transactions": partial(transactions, ignore_hashes), "start_planned": start_planned, "end_planned": end_planned, "start_actual": start_actual, @@ -637,15 +637,15 @@ def activity(xml, resource=no_resource, major_version='1', version=None): return Activity(**data) -def document_from_bytes(xml_resource, resource=no_resource): - return activities(BytesIO(xml_resource), resource) +def document_from_bytes(xml_resource, resource=no_resource, ignore_hashes=False): + return activities(BytesIO(xml_resource), resource, ignore_hashes) def document_from_file(xml_resource, resource=no_resource): return activities(open(xml_resource, 'rb'), resource) -def activities(xmlfile, resource=no_resource): +def activities(xmlfile, resource=no_resource, ignore_hashes=False): major_version = '1' version = None try: @@ -656,7 +656,7 @@ def activities(xmlfile, resource=no_resource): major_version = '2' elif event == 'end' and elem.tag == 'iati-activity': try: - yield activity(elem, resource=resource, major_version=major_version, version=version) + yield activity(elem, resource=resource, major_version=major_version, version=version, ignore_hashes=ignore_hashes) except MissingValue as exe: log.error(_("Failed to import a valid Activity error was: {0}".format(exe), logger='failed_activity', dataset=resource.dataset_id, resource=resource.url), diff --git a/iati_datastore/iatilib/test/test_parser.py b/iati_datastore/iatilib/test/test_parser.py index 5c731ecb..106cf312 100644 --- a/iati_datastore/iatilib/test/test_parser.py +++ b/iati_datastore/iatilib/test/test_parser.py @@ -696,7 +696,7 @@ class TestOrganisation(AppTestCase): def test_org_role_looseness(self): # organisationrole should be "Implementing" but can be "implementing". # This also tests role V1->V2 mapping. - orgrole = parse.participating_orgs(ET.XML( + orgrole = parse.participating_orgs(False, ET.XML( u'' ))[0] self.assertEquals( @@ -705,13 +705,13 @@ def test_org_role_looseness(self): ) def test_org_type(self): - orgtype = parse.reporting_org(ET.XML( + orgtype = parse.reporting_org(False, ET.XML( u"""""" )) self.assertEquals(cl.OrganisationType.international_ngo, orgtype.type) def test_org_type_missing(self): - orgtype = parse.reporting_org(ET.XML( + orgtype = parse.reporting_org(False, ET.XML( u"""""" )) self.assertEquals(None, orgtype.type) @@ -720,7 +720,7 @@ def test_org_type_missing(self): class TestParticipation(AppTestCase): def test_repeated_participation(self): # Identical participations should be filtered - participations = parse.participating_orgs( + participations = parse.participating_orgs(False, ET.XML(u""" Concern Universal @@ -731,7 +731,7 @@ def test_repeated_participation(self): self.assertEquals(1, len(participations)) def test_same_org_different_role(self): - participations = parse.participating_orgs( + participations = parse.participating_orgs(False, ET.XML(u""" Concern Universal Concern Universal @@ -794,7 +794,7 @@ def __init__(self, methodName='runTest'): """ def test_missing_code(self): - transactions = parse.transactions( + transactions = parse.transactions(False, ET.XML(u''' test @@ -805,7 +805,7 @@ def test_missing_code(self): self.assertEquals(1, len(transactions)) def test_big_value(self): - transaction = parse.transactions( + transaction = parse.transactions(False, ET.XML(u''' test @@ -817,7 +817,7 @@ def test_big_value(self): @mock.patch('iatilib.parse.iati_decimal') def test_iati_int_called(self, mock): - parse.transactions( + parse.transactions(False, ET.XML(u''' test @@ -837,7 +837,7 @@ def test_provider_activity_id(self): """ - transaction = parse.transactions(ET.XML(sample))[0] + transaction = parse.transactions(False, ET.XML(sample))[0] self.assertEquals(u'GB-1-202907', transaction.provider_org_activity_id) def test_provider_org_text(self): @@ -848,7 +848,7 @@ def test_provider_org_text(self): """ - transaction = parse.transactions(ET.XML(sample))[0] + transaction = parse.transactions(False, ET.XML(sample))[0] self.assertEquals(u'DFID', transaction.provider_org_text) def test_receiver_activity_id(self): @@ -859,7 +859,7 @@ def test_receiver_activity_id(self): """ - transaction = parse.transactions(ET.XML(sample))[0] + transaction = parse.transactions(False, ET.XML(sample))[0] self.assertEquals(u'GB-CHC-1068839-dfid_ag_11-13', transaction.receiver_org_activity_id) def test_receiver_org_text(self): @@ -870,7 +870,7 @@ def test_receiver_org_text(self): """ - transaction = parse.transactions(ET.XML(sample))[0] + transaction = parse.transactions(False, ET.XML(sample))[0] self.assertEquals(u'Bond', transaction.receiver_org_text) def test_description(self): @@ -880,30 +880,30 @@ def test_description(self): Funds received from DFID for activities in Aug- Sept 2011 """ - transaction = parse.transactions(ET.XML(sample))[0] + transaction = parse.transactions(False, ET.XML(sample))[0] self.assertEquals( u'Funds received from DFID for activities in Aug- Sept 2011', transaction.description ) def test_flow_type(self): - transaction = parse.transactions(ET.XML(self.codelists))[0] + transaction = parse.transactions(False, ET.XML(self.codelists))[0] self.assertEquals(u'30', transaction.flow_type.value) def test_finance_type(self): - transaction = parse.transactions(ET.XML(self.codelists))[0] + transaction = parse.transactions(False, ET.XML(self.codelists))[0] self.assertEquals(u'110', transaction.finance_type.value) def test_aid_type(self): - transaction = parse.transactions(ET.XML(self.codelists))[0] + transaction = parse.transactions(False, ET.XML(self.codelists))[0] self.assertEquals(u'B01', transaction.aid_type.value) def test_tied_status(self): - transaction = parse.transactions(ET.XML(self.codelists))[0] + transaction = parse.transactions(False, ET.XML(self.codelists))[0] self.assertEquals(u'5', transaction.tied_status.value) def test_disbursement_channel(self): - transaction = parse.transactions(ET.XML(self.codelists))[0] + transaction = parse.transactions(False, ET.XML(self.codelists))[0] self.assertEquals(u'2', transaction.disbursement_channel.value)