simplify docker compose layers, update hyku and bulkrax

notch8 · Jan 3, 2025 · 0eeb409 · 0eeb409
1 parent 144c62c
commit 0eeb409
Show file tree

Hide file tree

Showing 3 changed files with 213 additions and 168 deletions.
diff --git a/config/initializers/bulkrax.rb b/config/initializers/bulkrax.rb
@@ -2,125 +2,124 @@
 
 # Ensure Knapsack version gets loaded after Hyku's bulkrax.rb
 Rails.application.config.after_initialize do
-  if ENV.fetch('HYKU_BULKRAX_ENABLED', 'true') == 'true'
-    Bulkrax.setup do |config|
-      ##
-      # By default this is the first registered curation concern.  But based on tests and past
-      # configs, this should be 'GenericWork'.  Note: The below value could change, but it should be
-      # explicit.
-      #
-      # See https://github.com/samvera/hyku/blob/07fde572f9152d513b13f71cae90dd4fdfbfba6c/config/initializers/hyrax.rb#L200-L202
-      config.default_work_type = 'GenericWork'
-
-      # Setting the available parsers for Adventist.
-      config.parsers = [
-        { name: "OAI - Adventist Digital Library", class_name: "Bulkrax::OaiAdventistQdcParser", partial: "oai_adventist_fields" },
-        { name: "CSV - Comma Separated Values", class_name: "Bulkrax::CsvParser", partial: "csv_fields" },
-      ]
-
-      # Should Bulkrax make up source identifiers for you? This allow round tripping
-      # and download errored entries to still work, but does mean if you upload the
-      # same source record in two different files you WILL get duplicates.
-      # It is given two aruguments, self at the time of call and the index of the reocrd
-      #    config.fill_in_blank_source_identifiers = ->(parser, index) { "b-#{parser.importer.id}-#{index}"}
-      # or use a uuid
-      #    config.fill_in_blank_source_identifiers = ->(parser, index) { SecureRandom.uuid }
-
-      # Field mappings
-      # Create a completely new set of mappings by replacing the whole set as follows
-      #   config.field_mappings = {
-      #     "Bulkrax::OaiDcParser" => { **individual field mappings go here*** }
-      #   }
-
-      # Add to, or change existing mappings as follows
-      #   e.g. to exclude date
-      #   config.field_mappings["Bulkrax::OaiDcParser"]["date"] = { from: ["date"], excluded: true  }
-      #
-      # #   e.g. to add the required source_identifier field
-      #   #   config.field_mappings["Bulkrax::CsvParser"]["source_id"] = { from: ["old_source_id"], source_identifier: true  }
-      # If you want Bulkrax to fill in source_identifiers for you, see below
-
-      # To duplicate a set of mappings from one parser to another
-      #   config.field_mappings["Bulkrax::OaiOmekaParser"] = {}
-      #   config.field_mappings["Bulkrax::OaiDcParser"].each {|key,value| config.field_mappings["Bulkrax::OaiOmekaParser"][key] = value }
-      config.field_mappings['Bulkrax::OaiAdventistQdcParser'] = {
-          'abstract' => { from:  ['abstract'] },
-          'aark_id' => { from:  ['aark_id'] },
-          'identifier' => { from:  ['identifier'], source_identifier: true },
-          'bibliographic_citation' => { from:  ['bibliographic_citation'] },
-          'creator' => { from:  ['creator'] },
-          'contributor' => { from:  ['contributor'] },
-          'edition' => { from:  ['edition'] },
-          'resource_type' => { from:  ['resource_type'] },
-          'issue_number' => { from:  ['issue_number'] },
-          'language' => { from:  ['language'] },
-          'description' => { from:  ['description'] },
-          'pagination' => { from:  ['pagination'] },
-          'extent' => { from:  ['extent'], split: ';' },
-          'source' => { from:  ['source'] },
-          'date_issued' => { from:  ['date_issued'] },
-          'alt' => { from:  ['geocode'] },
-          'publisher' => { from:  ['publisher'], split: ';' },
-          'rights_statement' => { from:  ['rights_statement'] },
-          'part_of' => { from:  ['part_of'] },
-          'part' => { from:  ['part_of'] },
-          'date_created' => { from:  ['date_created'] },
-          'title' => { from:  ['title'] },
-          'subject' => { from:  ['subject'], split: ';' },
-          'volume_number' => { from:  ['volume_number'] },
-          'keyword' => { from: ['keyword'], split: ';' },
-          'location' => { from: ['location'], split: ';' },
-          'model' => { from: ['model', 'work_type'] },
-          'remote_files' => { from: ['related_url'], split: ';', parsed: true },
-          'thumbnail_url' => { from: ['thumbnail_url'], default_thumbnail: true, parsed: true },
-          'video_embed' => { from: ['video_embed'] },
-          'refereed' => { from: ['peer_reviewed'] }
-      }
-      config.field_mappings['Bulkrax::CsvParser'] = {
-          'abstract' => { from:  ['description.abstract'] },
-          'aark_id' => { from:  ['identifier.ark'] },
-          'identifier' => { from:  ['identifier'], source_identifier: true },
-          'bibliographic_citation' => { from:  ['identifier.bibliographicCitation'] },
-          'creator' => { from:  ['creator'], split: ';' },
-          'contributor' => { from:  ['contributor'], split: ';' },
-          'edition' => { from:  ['title.release'] },
-          'resource_type' => { from:  ['type'] },
-          'issue_number' => { from:  ['relation.isPartOfIssue'] },
-          'language' => { from:  ['language'], split: ';' },
-          'description' => { from:  ['description'], split: ';' },
-          'pagination' => { from:  ['format.extent'] },
-          'extent' => { from:  ['format.extent'], split: ';' },
-          'source' => { from:  ['source'], split: ';' },
-          'date_issued' => { from:  ['date'] },
-          'alt' => { from:  ['coverage.spatial'] },
-          'publisher' => { from:  ['publisher'], split: ';' },
-          'rights_statement' => { from:  ['rights'] },
-          'part_of' => { from:  ['relation.isPartOf'], split: ';' },
-          'part' => { from:  ['relation.isPartOf'] },
-          'date_created' => { from:  ['date.other'] },
-          'title' => { from:  ['title'] },
-          'subject' => { from:  ['subject'], split: ';' },
-          'volume_number' => { from:  ['relation.isPartOfVolume'] },
-          'keyword' => { from: ['keyword'], split: ';' },
-          'location' => { from: ['location'], split: ';' },
-          'model' => { from: ['work_type'] },
-          'remote_files' => { from: ['related_url'], split: ';', parsed: true },
-          'remote_url' => { from: ['official_url', 'remote_url'], split: ';' },
-          'thumbnail_url' => { from: ['thumbnail_url'], default_thumbnail: true, parsed: true },
-          'video_embed' => { from: ['video_embed'] },
-          'refereed' => { from: ['peer_reviewed'] }
-      }
+  Bulkrax.setup do |config|
+    ##
+    # By default this is the first registered curation concern.  But based on tests and past
+    # configs, this should be 'GenericWork'.  Note: The below value could change, but it should be
+    # explicit.
+    #
+    # See https://github.com/samvera/hyku/blob/07fde572f9152d513b13f71cae90dd4fdfbfba6c/config/initializers/hyrax.rb#L200-L202
+    config.default_work_type = 'GenericWork'
+
+    # Setting the available parsers for Adventist.
+    config.parsers = [
+      { name: "OAI - Adventist Digital Library", class_name: "Bulkrax::OaiAdventistQdcParser", partial: "oai_adventist_fields" },
+      { name: "CSV - Comma Separated Values", class_name: "Bulkrax::CsvParser", partial: "csv_fields" },
+    ]
+
+    # Should Bulkrax make up source identifiers for you? This allow round tripping
+    # and download errored entries to still work, but does mean if you upload the
+    # same source record in two different files you WILL get duplicates.
+    # It is given two aruguments, self at the time of call and the index of the reocrd
+    #    config.fill_in_blank_source_identifiers = ->(parser, index) { "b-#{parser.importer.id}-#{index}"}
+    # or use a uuid
+    #    config.fill_in_blank_source_identifiers = ->(parser, index) { SecureRandom.uuid }
 
-      config.field_mappings['Bulkrax::CsvParser'].merge!(
-        'parents' => { from: ['parents'], split: /\s*[;|]\s*/, related_parents_field_mapping: true },
-        'children' => { from: ['children'], split: /\s*[;|]\s*/, related_children_field_mapping: true }
-      )
+    # Field mappings
+    # Create a completely new set of mappings by replacing the whole set as follows
+    #   config.field_mappings = {
+    #     "Bulkrax::OaiDcParser" => { **individual field mappings go here*** }
+    #   }
 
-      # Lambda to set the default field mapping
-      config.default_field_mapping = lambda do |field|
-        return if field.blank?
-        {
-          field.to_s =>
+    # Add to, or change existing mappings as follows
+    #   e.g. to exclude date
+    #   config.field_mappings["Bulkrax::OaiDcParser"]["date"] = { from: ["date"], excluded: true  }
+    #
+    # #   e.g. to add the required source_identifier field
+    #   #   config.field_mappings["Bulkrax::CsvParser"]["source_id"] = { from: ["old_source_id"], source_identifier: true  }
+    # If you want Bulkrax to fill in source_identifiers for you, see below
+
+    # To duplicate a set of mappings from one parser to another
+    #   config.field_mappings["Bulkrax::OaiOmekaParser"] = {}
+    #   config.field_mappings["Bulkrax::OaiDcParser"].each {|key,value| config.field_mappings["Bulkrax::OaiOmekaParser"][key] = value }
+    config.field_mappings['Bulkrax::OaiAdventistQdcParser'] = {
+      'abstract' => { from:  ['abstract'] },
+      'aark_id' => { from:  ['aark_id'] },
+      'identifier' => { from:  ['identifier'], source_identifier: true },
+      'bibliographic_citation' => { from:  ['bibliographic_citation'] },
+      'creator' => { from:  ['creator'] },
+      'contributor' => { from:  ['contributor'] },
+      'edition' => { from:  ['edition'] },
+      'resource_type' => { from:  ['resource_type'] },
+      'issue_number' => { from:  ['issue_number'] },
+      'language' => { from:  ['language'] },
+      'description' => { from:  ['description'] },
+      'pagination' => { from:  ['pagination'] },
+      'extent' => { from:  ['extent'], split: ';' },
+      'source' => { from:  ['source'] },
+      'date_issued' => { from:  ['date_issued'] },
+      'alt' => { from:  ['geocode'] },
+      'publisher' => { from:  ['publisher'], split: ';' },
+      'rights_statement' => { from:  ['rights_statement'] },
+      'part_of' => { from:  ['part_of'] },
+      'part' => { from:  ['part_of'] },
+      'date_created' => { from:  ['date_created'] },
+      'title' => { from:  ['title'] },
+      'subject' => { from:  ['subject'], split: ';' },
+      'volume_number' => { from:  ['volume_number'] },
+      'keyword' => { from: ['keyword'], split: ';' },
+      'location' => { from: ['location'], split: ';' },
+      'model' => { from: ['model', 'work_type'] },
+      'remote_files' => { from: ['related_url'], split: ';', parsed: true },
+      'thumbnail_url' => { from: ['thumbnail_url'], default_thumbnail: true, parsed: true },
+      'video_embed' => { from: ['video_embed'] },
+      'refereed' => { from: ['peer_reviewed'] }
+    }
+    config.field_mappings['Bulkrax::CsvParser'] = {
+      'abstract' => { from:  ['description.abstract'] },
+      'aark_id' => { from:  ['identifier.ark'] },
+      'identifier' => { from:  ['identifier'], source_identifier: true },
+      'bibliographic_citation' => { from:  ['identifier.bibliographicCitation'] },
+      'creator' => { from:  ['creator'], split: ';' },
+      'contributor' => { from:  ['contributor'], split: ';' },
+      'edition' => { from:  ['title.release'] },
+      'resource_type' => { from:  ['type'] },
+      'issue_number' => { from:  ['relation.isPartOfIssue'] },
+      'language' => { from:  ['language'], split: ';' },
+      'description' => { from:  ['description'], split: ';' },
+      'pagination' => { from:  ['format.extent'] },
+      'extent' => { from:  ['format.extent'], split: ';' },
+      'source' => { from:  ['source'], split: ';' },
+      'date_issued' => { from:  ['date'] },
+      'alt' => { from:  ['coverage.spatial'] },
+      'publisher' => { from:  ['publisher'], split: ';' },
+      'rights_statement' => { from:  ['rights'] },
+      'part_of' => { from:  ['relation.isPartOf'], split: ';' },
+      'part' => { from:  ['relation.isPartOf'] },
+      'date_created' => { from:  ['date.other'] },
+      'title' => { from:  ['title'] },
+      'subject' => { from:  ['subject'], split: ';' },
+      'volume_number' => { from:  ['relation.isPartOfVolume'] },
+      'keyword' => { from: ['keyword'], split: ';' },
+      'location' => { from: ['location'], split: ';' },
+      'model' => { from: ['work_type'] },
+      'remote_files' => { from: ['related_url'], split: ';', parsed: true },
+      'remote_url' => { from: ['official_url', 'remote_url'], split: ';' },
+      'thumbnail_url' => { from: ['thumbnail_url'], default_thumbnail: true, parsed: true },
+      'video_embed' => { from: ['video_embed'] },
+      'refereed' => { from: ['peer_reviewed'] }
+    }
+
+    config.field_mappings['Bulkrax::CsvParser'].merge!(
+      'parents' => { from: ['parents'], split: /\s*[;|]\s*/, related_parents_field_mapping: true },
+      'children' => { from: ['children'], split: /\s*[;|]\s*/, related_children_field_mapping: true }
+    )
+
+    # Lambda to set the default field mapping
+    config.default_field_mapping = lambda do |field|
+      return if field.blank?
+      {
+        field.to_s =>
           {
             from: [field.to_s],
             split: false,
@@ -129,43 +128,43 @@
             excluded: false,
             default_thumbnail: false
           }
-        }
-      end
-
-      # WorkType to use as the default if none is specified in the import
-      # Default is the first returned by Hyrax.config.curation_concerns
-      # config.default_work_type = MyWork
-
-      # Path to store pending imports
-      # config.import_path = 'tmp/imports'
-
-      # Path to store exports before download
-      # config.export_path = 'tmp/exports'
-
-      # Server name for oai request header
-      # config.server_name = '[email protected]'
-
-      # Field_mapping for establishing a parent-child relationship (FROM parent TO child)
-      # This can be a Collection to Work, or Work to Work relationship
-      # This value IS NOT used for OAI, so setting the OAI Entries here will have no effect
-      # The mapping is supplied per Entry, provide the full class name as a string, eg. 'Bulkrax::CsvEntry'
-      # Example:
-      #   {
-      #     'Bulkrax::RdfEntry'  => 'http://opaquenamespace.org/ns/contents',
-      #     'Bulkrax::CsvEntry'  => 'children'
-      #   }
-      # By default no parent-child relationships are added
-      # config.parent_child_field_mapping = { }
-
-      # Field_mapping for establishing a collection relationship (FROM work TO collection)
-      # This value IS NOT used for OAI, so setting the OAI parser here will have no effect
-      # The mapping is supplied per Entry, provide the full class name as a string, eg. 'Bulkrax::CsvEntry'
-      # The default value for CSV is collection
-      # Add/replace parsers, for example:
-      # config.collection_field_mapping['Bulkrax::RdfEntry'] = 'http://opaquenamespace.org/ns/set'
-
-      # Properties that should not be used in imports/exports. They are reserved for use by Hyrax.
-      # config.reserved_properties += ['my_field']
+      }
     end
+
+    # WorkType to use as the default if none is specified in the import
+    # Default is the first returned by Hyrax.config.curation_concerns
+    # config.default_work_type = MyWork
+
+    # Path to store pending imports
+    # config.import_path = 'tmp/imports'
+
+    # Path to store exports before download
+    # config.export_path = 'tmp/exports'
+
+    # Server name for oai request header
+    # config.server_name = '[email protected]'
+
+    # Field_mapping for establishing a parent-child relationship (FROM parent TO child)
+    # This can be a Collection to Work, or Work to Work relationship
+    # This value IS NOT used for OAI, so setting the OAI Entries here will have no effect
+    # The mapping is supplied per Entry, provide the full class name as a string, eg. 'Bulkrax::CsvEntry'
+    # Example:
+    #   {
+    #     'Bulkrax::RdfEntry'  => 'http://opaquenamespace.org/ns/contents',
+    #     'Bulkrax::CsvEntry'  => 'children'
+    #   }
+    # By default no parent-child relationships are added
+    # config.parent_child_field_mapping = { }
+
+    # Field_mapping for establishing a collection relationship (FROM work TO collection)
+    # This value IS NOT used for OAI, so setting the OAI parser here will have no effect
+    # The mapping is supplied per Entry, provide the full class name as a string, eg. 'Bulkrax::CsvEntry'
+    # The default value for CSV is collection
+    # Add/replace parsers, for example:
+    # config.collection_field_mapping['Bulkrax::RdfEntry'] = 'http://opaquenamespace.org/ns/set'
+
+    # Properties that should not be used in imports/exports. They are reserved for use by Hyrax.
+    # config.reserved_properties += ['my_field']
+
   end
 end