From 56e902ff41c35c04d993433a1eb14f3753c37de5 Mon Sep 17 00:00:00 2001
From: covesturtevant
Date: Sat, 14 Dec 2024 10:26:15 -0700
Subject: [PATCH] turn on L0 data archive to bucket for aepg600m

---
Reviewer notes (placed after the "---" separator and before the diff, so they
are neither commit message nor patch content):

* Purpose: this hunk enables the previously commented-out "Export L0 data to
  bucket" section of the bash script embedded in the pipeline spec, and drops
  the "# !!!..." banner and the blank line that followed it.
* What the enabled section does, as written below: when
  "$OUT_PATH/$SOURCE_TYPE" is a directory it creates a temporary directory,
  symlinks every *.parquet file found under $OUT_PATH into
  v1/<source type>/ms=<year>-<month>/source_id=<source id>/<file name>,
  runs "rclone copy --copy-links" from that directory to
  ":gcs://${BUCKET_NAME}", and finally removes the temporary directory.
* Fix relative to the original patch: the section header
  "Export L0 data to bucket" stays a "#" comment. Without the "#", bash would
  try to execute "Export" as a command and fail with "command not found";
  if the script runs with "set -e" (not visible in this hunk) that would stop
  the step before anything is exported.
* NOTE(review): this copy of the patch was recovered from a
  whitespace-collapsed source. Leading indentation and the position of the
  blank context line after "set +x" are reconstructed - confirm them against
  the target file before applying. The post-image blob id on the "index" line
  predates the comment fix.

 pipe/aepg600m/aepg600m_data_source_trino.yaml | 70 +++++++++----------
 1 file changed, 34 insertions(+), 36 deletions(-)

diff --git a/pipe/aepg600m/aepg600m_data_source_trino.yaml b/pipe/aepg600m/aepg600m_data_source_trino.yaml
index 2a95b8c15..0352067ea 100644
--- a/pipe/aepg600m/aepg600m_data_source_trino.yaml
+++ b/pipe/aepg600m/aepg600m_data_source_trino.yaml
@@ -97,42 +97,40 @@ transform:
     fi
     set +x
 
-    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-    # Export L0 data to bucket
-    # if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then
-    #   linkdir=$(mktemp -d)
-    #   shopt -s globstar
-    #   out_parquet_glob="${OUT_PATH}/**/*.parquet"
-    #   # Example: /pfs/out/li191r/2023/01/01/12345/data/file.parquet
-    #   echo "Linking output files to ${linkdir}"
-    #   #set -x
-    #   for f in $out_parquet_glob; do
-    #     # Parse the path
-    #     [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]]
-    #     fsourcetype="${BASH_REMATCH[1]}"
-    #     fyear="${BASH_REMATCH[2]}"
-    #     fmonth="${BASH_REMATCH[3]}"
-    #     fday="${BASH_REMATCH[4]}"
-    #     fsourceid="${BASH_REMATCH[5]}"
-    #     fname="${BASH_REMATCH[6]}"
-    #     outdir="${linkdir}/v1/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}"
-    #     mkdir -p "${outdir}"
-    #     ln -s "${f}" "${outdir}/${fname}"
-    #   done
-    #   #set +x
-    #   echo "Syncing files to bucket"
-    #   rclone \
-    #     --no-check-dest \
-    #     --copy-links \
-    #     --gcs-bucket-policy-only \
-    #     --gcs-no-check-bucket \
-    #     copy \
-    #     "${linkdir}" \
-    #     ":gcs://${BUCKET_NAME}"
-    #   echo "Removing temporary files"
-    #   rm -rf $linkdir
-    # fi
+    # Export L0 data to bucket
+    if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then
+      linkdir=$(mktemp -d)
+      shopt -s globstar
+      out_parquet_glob="${OUT_PATH}/**/*.parquet"
+      # Example: /pfs/out/li191r/2023/01/01/12345/data/file.parquet
+      echo "Linking output files to ${linkdir}"
+      #set -x
+      for f in $out_parquet_glob; do
+        # Parse the path
+        [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]]
+        fsourcetype="${BASH_REMATCH[1]}"
+        fyear="${BASH_REMATCH[2]}"
+        fmonth="${BASH_REMATCH[3]}"
+        fday="${BASH_REMATCH[4]}"
+        fsourceid="${BASH_REMATCH[5]}"
+        fname="${BASH_REMATCH[6]}"
+        outdir="${linkdir}/v1/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}"
+        mkdir -p "${outdir}"
+        ln -s "${f}" "${outdir}/${fname}"
+      done
+      #set +x
+      echo "Syncing files to bucket"
+      rclone \
+        --no-check-dest \
+        --copy-links \
+        --gcs-bucket-policy-only \
+        --gcs-no-check-bucket \
+        copy \
+        "${linkdir}" \
+        ":gcs://${BUCKET_NAME}"
+      echo "Removing temporary files"
+      rm -rf $linkdir
+    fi
     EOF
   env:
     # Static environment variables for data conversion step