-
Notifications
You must be signed in to change notification settings - Fork 0
Seed data migration
config/initializers/hyrax_migrator.rb
config.skip_field_mode = true
# these are defaults
config.migration_user = '[email protected]'
config.upload_storage_service = :file_system
config.ingest_storage_service = :file_system
Default admin set and collection
bundle exec rails hyrax:default_admin_set:create
bundle exec rails hyrax:default_collection_types:create
Default OD2 admin sets and collections
bundle exec rake oregon_digital:create_admin_sets
bundle exec rake oregon_digital:create_collections
Load workflows
bundle exec rails hyrax:workflow:load
-
scp bagsclean.tar.gz from OD1 server to a local folder
/data/tmp/shared/
and untar, which would create bagsclean
with subfolders for each seed data collection. -
Make a zip version of a single bag
/data/tmp/bagsclean/Baseball_jpegs/3t945r08v/
from within the folder, and move it to /data/tmp/shared/batch_baseball/3t945r08v.zip
. -
Load the console in the server container and run the following to migrate bag located at
/data/tmp/shared/batch_baseball/3t945r08v.zip
:
bundle exec rails c
# Configure the migrator to read and write bags on the shared file system.
# (Assumes Hyrax::Migrator.config hands back the same object on every call.)
migrator_config = Hyrax::Migrator.config
migrator_config.skip_field_mode = true
migrator_config.upload_storage_service = :file_system
migrator_config.ingest_storage_service = :file_system
migrator_config.file_system_path = "/data/tmp/shared"
migrator_config.ingest_local_path = "/data/tmp/shared"

# The bag to migrate: its PID and the zip that was created for it.
pid = "3t945r08v"
file_path = "/data/tmp/shared/batch_baseball/#{pid}.zip"

# Drop the migrator record left over from an earlier run of this PID, if any.
stale_work = Hyrax::Migrator::Work.find_by_pid(pid)
stale_work.delete if stale_work.present?

# Create a fresh record and send it through the default middleware.
w = Hyrax::Migrator::Work.create(pid: pid, file_path: file_path)
m = Hyrax::Migrator::Middleware.default
m.start(w)
To allow a restart, existing records would need to be removed for now (restart for existing records hasn't been implemented yet):
bundle exec rails c
pid = "3t945r08v"

# Remove the Fedora object together with its Sipity entity. The entity is only
# looked up when the object exists, so the query never runs with a nil global id
# (which could otherwise match and delete an unrelated row).
if ActiveFedora::Base.exists?(pid)
  work = ActiveFedora::Base.find(pid)
  Sipity::Entity.find_by(proxy_for_global_id: work.to_global_id.to_s)&.delete
  work.delete
end

# Runs whether or not the object was found above.
# NOTE(review): presumably this clears what Fedora keeps for a deleted PID so
# the PID can be ingested again - confirm against the ActiveFedora docs.
ActiveFedora::Base.eradicate(pid)

# Finally remove the migrator's own record for this PID, if there is one.
Hyrax::Migrator::Work.find_by_pid(pid)&.delete
-
scp
bagsclean.tar.gz
from OD1 server to a local folder like /data/tmp/shared/
and untar, which would create bagsclean
with subfolders for each seed data collection. -
cd into
Baseball_jpegs
folder /data/tmp/bagsclean/Baseball_jpegs/
and execute zip_bags.sh
.
/data/tmp/bagsclean/Baseball_jpegs/zip_bags.sh
#!/bin/bash
# Usage: cd into the collection folder 'Baseball_jpegs' and run bash zip_bags.sh
#
# For every PID listed below, creates <PID>.zip next to the <PID>/ directory.
# Each archive is built from inside the PID directory, so the directory's
# contents sit at the root of the zip.

# Baseball jpegs (20) collection
PIDS=(
  '3t945r08v'
  '4t64gn50h'
  '5138jf19t'
  '5t34sj883'
  '6t053g24j'
  '8c97kq715'
  'cc08hf952'
  'fb494874x'
  'js956g08w'
  'k0698775b'
  'ms35t889d'
  'pg15bf278'
  'rn3011720'
  'sq87bt983'
  'sx61dm57b'
  'vq27zn81f'
  'wh246s39p'
  'wh246s438'
  'xp68kg588'
  'zs25x8763'
)

# zip from within the directory of each PID
for pid in "${PIDS[@]}"; do
  echo "$pid"
  # Work in a subshell: if the PID directory is missing, nothing is zipped and
  # the current directory of the script is left untouched (an unconditional
  # `cd ..` after a failed `cd` would walk out of the collection folder).
  (
    cd "$pid" || exit 1
    zip -r "../$pid.zip" .
  ) || echo "Skipping $pid: could not zip bag directory" >&2
done
- Make batch folder
batch_baseball
and get the zip files generated with zip_bags.sh
mkdir /data/tmp/shared/batch_baseball
rsync -r -v /data/tmp/bagsclean/Baseball_jpegs/*.zip /data/tmp/shared/batch_baseball/
- Get into the workers container and run
BagIngestService
for batch batch_baseball
bundle exec rails c
# set configurations
# (Assumes Hyrax::Migrator.config hands back the same object on every call.)
migrator_config = Hyrax::Migrator.config
migrator_config.skip_field_mode = true

# set configurations (these are default values)
migrator_config.upload_storage_service = :file_system
migrator_config.ingest_storage_service = :file_system
migrator_config.file_system_path = "/data/tmp/shared"
migrator_config.ingest_local_path = "/data/tmp/shared"

# run ingest service, where 'batch_baseball' is the name of the directory under /data/tmp/shared
batch_names = ['batch_baseball']
i = Hyrax::Migrator::Services::BagIngestService.new(batch_names, migrator_config)
i.ingest
5. Follow logs via k8s
kubectl logs <workers-pod> --follow
- Follow logs in the workers container
docker logs <workers-container-id> --follow
- Transfer zip files to
/data/tmp/shared/batch_baseball
on staging.
kubectl get pods
kubectl cp tmp/batch_baseball/ <workers-pod>:/data/tmp/shared/batch_baseball/
- Get into the workers container and run
BagIngestService
for batch batch_baseball
kubectl exec -it <workers-pod> bash
bundle exec rails c
# Build a dedicated migrator configuration for this run.
c = Hyrax::Migrator::Configuration.new.tap do |config|
  config.skip_field_mode = true
  config.upload_storage_service = :file_system
  config.ingest_storage_service = :file_system
  config.file_system_path = "/data/tmp/shared"
  config.ingest_local_path = "/data/tmp/shared"
  # NOTE(review): the address below appears obscured in this document -
  # replace it with the real migration user's email before running.
  config.migration_user = '[email protected]'
  # Optional: prepare a custom crosswalk overrides file at the path below and
  # uncomment the next line to use it.
  # config.crosswalk_overrides_file = File.join(Rails.root, 'tmp/crosswalk_overrides.yml')
end

# Middleware configuration; serialized before it is handed to the ingest service.
m = Hyrax::Migrator::Middleware::Configuration.new
# remove or add actors
# m.actor_stack.delete_at(0)
m_config = Hyrax::Migrator::Serializers::MiddlewareConfigurationSerializer.serialize(m)

# Ingest the bags found in the 'batch_baseball' batch folder.
ingest_options = { middleware_config: m_config }
i = Hyrax::Migrator::Services::BagIngestService.new(['batch_baseball'], c, ingest_options)
i.ingest
- Inspect the logs in the workers container
kubectl logs <workers-pod> --follow
Add to OD2 lib/tasks/migration/bulk_delete.rake
and set batch_name
to a folder name inside the path defined at config.ingest_local_path
(defaults to /data/tmp/shared
in config/initializers/hyrax_migrator.rb
) in OD2.
Example
When batch_name
is batch_baseball
, the task below will scan bags at /data/tmp/shared/batch_baseball
to get the PIDs, and then remove them from the system.
lib/tasks/migration/bulk_delete.rake
# frozen_string_literal: true

# Deletes every work whose bag zip (<pid>.zip) is found in a batch folder under
# Hyrax::Migrator::Configuration#ingest_local_path. The batch folder name is
# read from the `collection` environment variable.
#
# Usage: bundle exec rake migration:bulk_delete collection=batch_baseball
namespace :migration do
  desc 'Bulk delete migrated items'
  task bulk_delete: :environment do
    batch_name = ENV['collection']
    abort 'Usage: rake migration:bulk_delete collection=<batch_name>' if batch_name.to_s.strip.empty?

    # Declared outside the loop so the rescue clause can name the PID that failed.
    pid = nil
    begin
      puts "Delete works already migrated at #{batch_name}"
      c = Hyrax::Migrator::Configuration.new
      batch_path = File.join(c.ingest_local_path, batch_name)
      bag_zip_files = Dir.entries(batch_path).select do |e|
        File.extname(e) == '.zip' && File.file?(File.join(batch_path, e))
      end

      bag_zip_files.each do |zip_file|
        # The zip's base name is the PID of the work it contains.
        pid = File.basename(zip_file, '.zip')
        puts "Deleting work #{pid}"

        # Only look up the Sipity entity when the Fedora object exists, so the
        # query never runs with a nil global id.
        if ActiveFedora::Base.exists?(pid)
          work = ActiveFedora::Base.find(pid)
          Sipity::Entity.find_by(proxy_for_global_id: work.to_global_id.to_s)&.delete
          work.delete
        end
        ActiveFedora::Base.eradicate(pid)
        Hyrax::Migrator::Work.find_by_pid(pid)&.delete

        puts "Successfully deleted #{pid}"
      end
      puts 'Done'
    rescue StandardError => e
      # The task stops at the first failure; remaining bags are left untouched.
      # No `return` here: a task body is a block, so `return` would raise
      # LocalJumpError instead of ending the task.
      puts(pid ? "Unable to delete #{pid}" : "Unable to delete works at #{batch_name}")
      puts "Error: #{e.message}"
      puts e.backtrace
    end
  end
end
Run task in the workers container
bundle exec rake migration:bulk_delete collection="batch_baseball"
$ od2
$ kubectl get pods
$ kubectl cp tmp/mytar.tar <workers-pod>:/data/tmp/shared/
lib/tasks/migration/ingest.rake
# frozen_string_literal: true

# Migrates every bag zip (<pid>.zip) found in a batch folder under
# /data/tmp/shared. The batch folder name is read from the `collection`
# environment variable.
#
# Usage: bundle exec rake migration:ingest collection="batch_baseball"
namespace :migration do
  desc 'ingest migration'
  task ingest: :environment do
    collection = ENV['collection']
    abort 'Usage: rake migration:ingest collection=<batch_name>' if collection.to_s.strip.empty?

    shared_path = '/data/tmp/shared'
    batch_path = File.join(shared_path, collection)

    # The configuration is identical for every bag, so apply it once up front.
    Hyrax::Migrator.config.skip_field_mode = true
    Hyrax::Migrator.config.upload_storage_service = :file_system
    Hyrax::Migrator.config.ingest_storage_service = :file_system
    Hyrax::Migrator.config.file_system_path = shared_path
    Hyrax::Migrator.config.ingest_local_path = shared_path

    # Each zip's base name is the PID of the work it contains.
    pids = Dir.entries(batch_path)
              .select { |e| File.extname(e) == '.zip' && File.file?(File.join(batch_path, e)) }
              .map { |zip_file| File.basename(zip_file, '.zip') }

    pids.each do |pid|
      # Clear any records from an earlier run of this PID before ingesting it again.
      cleanup(pid)
      w = Hyrax::Migrator::Work.create(pid: pid, file_path: File.join(batch_path, "#{pid}.zip"))
      m = Hyrax::Migrator::Middleware.default
      m.start(w)
    end
  end
end
# Removes the records an earlier migration of +pid+ left behind: the Fedora
# object and its Sipity entity (when the object exists), whatever
# ActiveFedora::Base.eradicate clears for the PID, and the migrator's own
# Hyrax::Migrator::Work record.
#
# @param pid [String] identifier of the work to remove
# @return the result of deleting the migrator record, or nil when none exists
def cleanup(pid)
  # Only look up the Sipity entity when the Fedora object exists, so the query
  # never runs with a nil global id (which could match an unrelated row).
  if ActiveFedora::Base.exists?(pid)
    work = ActiveFedora::Base.find(pid)
    Sipity::Entity.find_by(proxy_for_global_id: work.to_global_id.to_s)&.delete
    work.delete
  end
  ActiveFedora::Base.eradicate(pid)
  Hyrax::Migrator::Work.find_by_pid(pid)&.delete
end
Example:
$ bundle exec rake migration:ingest collection="batch_baseball"
collection batch_baseball
is the base name of the directory /data/tmp/shared/batch_baseball
, which contains all the zip files.