Skip to content

Commit

Permalink
import: fix import SLSP
Browse files Browse the repository at this point in the history
* Make the SLSP import more robust against malformed MARC21 data, for
  example when field 019 has no subfields.

Co-Authored-by: Peter Weber <[email protected]>
  • Loading branch information
rerowep committed Sep 14, 2023
1 parent 770dc5e commit 555e9d6
Showing 1 changed file with 70 additions and 85 deletions.
155 changes: 70 additions & 85 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,10 +727,9 @@ def get_fields(self, tag=None):
fields = []
items = get_field_items(self._blob_record)
for blob_key, blob_value in items:
field_data = {}
tag_value = blob_key[0:3]
tag_value = blob_key[:3]
if (tag_value == tag) or not tag:
field_data['tag'] = tag_value
field_data = {'tag': tag_value}
if len(blob_key) == 3: # if control field
field_data['data'] = blob_value.rstrip()
else:
Expand All @@ -753,9 +752,11 @@ def get_subfields(self, field, code=None):
"""Get all subfields having the given subfield code value."""
if int(field['tag']) < 10:
raise ValueError('data field expected (tag >= 01x)')
items = get_field_items(field['subfields'])
return [subfield_data for subfield_code, subfield_data in items
if (subfield_code == code) or not code]
items = get_field_items(field.get('subfields', {}))
return [
subfield_data for subfield_code, subfield_data in items
if (subfield_code == code) or not code
]

def build_value_with_alternate_graphic(
self, tag, code, label, index, link,
Expand Down Expand Up @@ -800,7 +801,7 @@ def clean_punctuation(value, punct, spaced_punct):
else:
error_print('WARNING NO VALUE:', self.bib_id, self.rero_id, tag,
code, label)
try:
with contextlib.suppress(Exception):
alt_gr = self.alternate_graphic[tag][link]
subfield = self.get_subfields(alt_gr['field'])[index]
value = clean_punctuation(subfield, punct, spaced_punct)
Expand All @@ -809,8 +810,6 @@ def clean_punctuation(value, punct, spaced_punct):
'value': value,
'language': self.get_language_script(alt_gr['script'])
})
except Exception as err:
pass
return data or None

def extract_description_from_marc_field(self, key, value, data):
Expand Down Expand Up @@ -1256,13 +1255,11 @@ def init_country(self):
"""Initialization country (008 and 044)."""
self.country = None
self.cantons = []
fields_044 = self.get_fields(tag='044')
if fields_044:
if fields_044 := self.get_fields(tag='044'):
field_044 = fields_044[0]
for cantons_code in self.get_subfields(field_044, 'c'):
try:
canton = cantons_code.split('-')[1].strip()
if canton:
if canton := cantons_code.split('-')[1].strip():
if canton in _CANTON:
self.cantons.append(canton)
else:
Expand All @@ -1275,10 +1272,8 @@ def init_country(self):
self.country = 'sz'
# We did not find a country in 044 trying 008.
if not self.country:
try:
with contextlib.suppress(Exception):
self.country = self.field_008_data[15:18].rstrip()
except Exception as err:
pass
# Use equivalent if country code is obsolete
if self.country in _OBSOLETE_COUNTRIES_MAPPING:
self.country = _OBSOLETE_COUNTRIES_MAPPING[self.country]
Expand Down Expand Up @@ -1338,7 +1333,7 @@ def init_date(self):
3. get dates from 773 $g
4. set start_date to 2050
"""
if (self.date_type_from_008 == 'q' or self.date_type_from_008 == 'n'):
if self.date_type_from_008 in ['q', 'n']:
self.date['note'] = 'Date(s) uncertain or unknown'
start_date = make_year(self.date1_from_008)
if not (start_date and start_date >= -9999 and start_date <= 2050):
Expand All @@ -1348,8 +1343,7 @@ def init_date(self):
for ind2 in ['1', '0', '2', '4', '3']:
for field_264 in fields_264:
if ind2 == field_264['ind2']:
subfields_c = self.get_subfields(field_264, 'c')
if subfields_c:
if subfields_c := self.get_subfields(field_264, 'c'):
year = re.search(r"(-?\d{1,4})", subfields_c[0])
if year:
year = int(year.group(0))
Expand All @@ -1364,8 +1358,7 @@ def init_date(self):
if not start_date:
fields_773 = self.get_fields('773')
for field_773 in fields_773:
subfields_g = self.get_subfields(field_773, 'g')
if subfields_g:
if subfields_g := self.get_subfields(field_773, 'g'):
year = re.search(r"(-?\d{4})", subfields_g[0])
if year:
year = int(year.group(0))
Expand All @@ -1374,7 +1367,7 @@ def init_date(self):
if not start_date:
start_date = 2050
self.date['note'] = \
'Date not available and automatically set to 2050'
'Date not available and automatically set to 2050'
error_print('WARNING START DATE 264:', self.bib_id, self.rero_id,
self.date1_from_008)
self.date['start_date'] = start_date
Expand All @@ -1399,7 +1392,7 @@ def get_script_from_lang(asian=False):
if asian:
default_script = 'hani'
script_per_lang = _SCRIPT_PER_LANG_ASIA
script = script_per_lang.get(self.lang_from_008, None)
script = script_per_lang.get(self.lang_from_008)
if not script:
for lang in self.langs_from_041_a:
if lang in script_per_lang:
Expand Down Expand Up @@ -1476,8 +1469,7 @@ def build_variant_title_data(self, string_set):
for field_246 in fields_246:
variant_data = {}
subfield_246_a = ''
subfields_246_a = self.get_subfields(field_246, 'a')
if subfields_246_a:
if subfields_246_a := self.get_subfields(field_246, 'a'):
subfield_246_a = subfields_246_a[0]
subfield_246_a_cleaned = remove_trailing_punctuation(
subfield_246_a, ',.', ':;/-=')
Expand All @@ -1498,37 +1490,40 @@ def build_variant_title_data(self, string_set):
subfield_a_parts = blob_value.split(':')
part_index = 0
for subfield_a_part in subfield_a_parts:
value_data = self. \
build_value_with_alternate_graphic(
'246', blob_key, subfield_a_part,
index, link, ',.', ':;/-=')
if value_data:
if value_data := self.build_value_with_alternate_graphic(
'246',
blob_key,
subfield_a_part,
index,
link,
',.',
':;/-=',
):
if part_index == 0:
variant_data['type'] = \
'bf:VariantTitle'
'bf:VariantTitle'
variant_data['mainTitle'] = value_data
else:
variant_data['subtitle'] = value_data
part_index += 1
elif blob_key in ['n', 'p']:
value_data = self. \
build_value_with_alternate_graphic(
'246', blob_key, blob_value,
index, link, ',.', ':;/-=')
if value_data:
if value_data := self.build_value_with_alternate_graphic(
'246',
blob_key,
blob_value,
index,
link,
',.',
':;/-=',
):
part_list.update_part(
value_data, blob_key, blob_value)
if blob_key != '__order__':
index += 1
the_part_list = part_list.get_part_list()
if the_part_list:
if the_part_list := part_list.get_part_list():
variant_data['part'] = the_part_list
if variant_data:
variant_list.append(variant_data)
else:
pass
# for showing the variant title skipped for debugging purpose
# print('variant skipped', subfield_246_a_cleaned)
return variant_list

def init_content_media_carrier_type(self):
Expand All @@ -1550,46 +1545,40 @@ def init_content_media_carrier_type(self):
type_key = content_media_carrier_type_per_tag[tag]
fields = self.get_fields(tag=tag)
for field in fields:
subfields_8 = self.get_subfields(field, '8')
if not subfields_8:
subfields_8 = ['0']
subfields_8 = self.get_subfields(field, '8') or ['0']
for subfield_b in self.get_subfields(field, 'b'):
type_found = False
for link in subfields_8:
linked_data = content_media_carrier_type.get(link, {})
if tag == '336':
linked_data_type_value = \
linked_data.get(type_key, [])
linked_data.get(type_key, [])
type_value = \
content_media_carrier_map_per_tag[tag].get(
content_media_carrier_map_per_tag[tag].get(
subfield_b, None)
if type_value and \
type_value not in linked_data_type_value:
type_value not in linked_data_type_value:
linked_data_type_value.append(type_value)
linked_data[type_key] = linked_data_type_value
type_found = True
else:
if link == '0' and tag == '337':
media_type_from_unlinked_337 = \
content_media_carrier_map_per_tag[tag].get(
content_media_carrier_map_per_tag[tag].get(
subfield_b, None)
linked_data_type_value = \
linked_data.get(type_key, '')
type_value = \
content_media_carrier_map_per_tag[tag].get(
subfield_b, None)
if type_value:
linked_data.get(type_key, '')
if type_value := content_media_carrier_map_per_tag[
tag
].get(subfield_b, None):
linked_data_type_value = type_value
linked_data[type_key] = linked_data_type_value
type_found = True
if tag == '338':
# extract mediaType from the first char of $b
media_type_from_338 = \
_MEDIA_TYPE_MAPPING.get(
subfield_b[0], None)
if media_type_from_338:
if media_type_from_338 := \
_MEDIA_TYPE_MAPPING.get(subfield_b[0]):
linked_data['mediaTypeFrom338'] = \
media_type_from_338
media_type_from_338
if type_found:
content_media_carrier_type[link] = linked_data
break # subfield $b is not repeatable
Expand Down Expand Up @@ -1655,33 +1644,26 @@ def do(self, blob, ignore_missing=True, exception_handlers=None):
except Exception as err:
self.bib_id = '???'

# get the language code
fields_101 = self.get_fields(tag='101')
if fields_101:
if fields_101 := self.get_fields(tag='101'):
field_101_a = self.get_subfields(fields_101[0], 'a')
field_101_g = self.get_subfields(fields_101[0], 'g')
if field_101_a:
self.lang_from_101 = field_101_a[0]
if field_101_g:
self.lang_from_101 = field_101_g[0]

# get the type of continuing resource
fields_110 = self.get_fields(tag='110')
if fields_110:
if fields_110 := self.get_fields(tag='110'):
field_110_a = self.get_subfields(fields_110[0], 'a')
if field_110_a and len(field_110_a[0]) > 0:
self.serial_type = field_110_a[0][0]

self.admin_meta_data = {}
enc_level = ''
if self.leader:
enc_level = self.leader[17] # LDR 17
if enc_level in _ENCODING_LEVEL_MAPPING:
encoding_level = _ENCODING_LEVEL_MAPPING[enc_level]
else:
encoding_level = _ENCODING_LEVEL_MAPPING['u']
self.admin_meta_data['encodingLevel'] = encoding_level

enc_level = self.leader[17] if self.leader else ''
encoding_level = (
_ENCODING_LEVEL_MAPPING[enc_level]
if enc_level in _ENCODING_LEVEL_MAPPING
else _ENCODING_LEVEL_MAPPING['u']
)
self.admin_meta_data = {'encodingLevel': encoding_level}
result = super().do(
blob,
ignore_missing=ignore_missing,
Expand Down Expand Up @@ -1710,8 +1692,8 @@ def get_language_script(self, unimarc_script_code):
"""
if unimarc_script_code in _UNIMARC_LANGUAGES_SCRIPTS:
script_code = _UNIMARC_LANGUAGES_SCRIPTS[unimarc_script_code]
lang = self.lang_from_101
if script_code in _LANGUAGES_SCRIPTS:
lang = self.lang_from_101
if lang in _LANGUAGES_SCRIPTS[script_code]:
return '-'.join([self.lang_from_101, script_code])
error_print('WARNING LANGUAGE SCRIPTS:', self.bib_id,
Expand All @@ -1732,7 +1714,7 @@ def get_alt_graphic_fields(self, tag=None):
items = get_field_items(self._blob_record)
for blob_key, blob_value in items:
field_data = {}
tag_value = blob_key[0:3]
tag_value = blob_key[:3]
if (tag_value == tag) or not tag:
field_data['tag'] = tag_value
if len(blob_key) == 3: # if control field
Expand All @@ -1745,7 +1727,7 @@ def get_alt_graphic_fields(self, tag=None):
subfields_7 = self.get_subfields(field_data, '7')
# alternate graphic link code start with 'a'
if subfields_6 and subfields_6[0][0] == 'a' \
and subfields_7 and subfields_7[0] != 'ba': # ba=latin
and subfields_7 and subfields_7[0] != 'ba': # ba=latin
tag_data = self.alternate_graphic.get(tag, {})
tag_data[subfields_6[0]] = {}
tag_data[subfields_6[0]]['field'] = field_data
Expand Down Expand Up @@ -1908,12 +1890,15 @@ def remove_leading_article(string, max_article_len=4):
if index == 0 and not field_245_a_end_with_equal:
if data_std.rstrip():
main_subtitle.append({'value': data_std.rstrip()})
if lang and index < len(data_lang_items):
if data_lang_items[index].rstrip():
main_subtitle.append({
'value': data_lang_items[index].rstrip(),
'language': lang
})
if (
lang
and index < len(data_lang_items)
and data_lang_items[index].rstrip()
):
main_subtitle.append({
'value': data_lang_items[index].rstrip(),
'language': lang
})
else:
main_title = []
subtitle = []
Expand Down

0 comments on commit 555e9d6

Please sign in to comment.