# Skip to content

# datagen

# datagen #1731

# Workflow file for this run

# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
##### README #####
#
# The CI action in this file is used to build the artifacts on pushes to a repository containing
# the ICU4X service account key. All steps are skipped unless the key is present.
#
# If you are a frequent contributor, you can add the key to your fork. The key is shared with
# icu4x collaborators and can be viewed here:
#
# https://drive.google.com/file/d/1LHq_sUb5NgpfDrJBcp3EsJFiUmoDbj36/view
#
# To add the key, follow these steps:
#
# 1. Go to the secrets on your fork:
#     - https://github.com/{USER}/icu4x/settings/secrets/actions
# 2. Click "New repository secret" and enter the following information:
#     - Name: ICU4X_GCP_SA_KEY
#     - Value: The contents of the file linked above
# 3. Click "Add secret"
# 4. Re-run the latest "Artifacts Build" action on your fork to make sure it works:
#     - https://github.com/{USER}/icu4x/actions/workflows/artifacts-build.yml
name: Artifacts Build

# Runs on every push; the expensive jobs below restrict themselves to the
# main branch of unicode-org/icu4x through their own `if:` conditions.
on: push

permissions:
  contents: read
  # Used by the "Deploy to GitHub Pages" job.
  pages: write
  # Used by the Workload Identity Provider authentication steps.
  id-token: write

env:
  GCP_PROJECT_ID: "dev-infra-273822"
  # Bucket receiving per-commit previews (docs, FFI docs).
  GCP_PR_BUCKET_ID: "icu4x-pr-artifacts"
  # Bucket receiving main-branch artifacts (wasm demo, benchmark history).
  GCP_MAIN_BUCKET_ID: "icu4x-main"

jobs:
  # Gate job: fails when the ICU4X_GCP_SA_KEY secret is absent, so the jobs
  # that declare `needs: credentials` (docs, ffi-docs) do not run without it.
  credentials:
    name: "Check Credentials for jobs that need JSON credentials"
    runs-on: "ubuntu-latest"
    env:
      ICU4X_GCP_SA_KEY: "${{ secrets.ICU4X_GCP_SA_KEY }}"
    steps:
      - name: "Check for credentials"
        run: |
          if [ -z "$ICU4X_GCP_SA_KEY" ]
          then
            echo "GCP key not found. Docs previews will not be uploaded. If you are a frequent contributor, you may add the key to your fork; for instructions, see 'artifacts-build.yml'"
            exit 1;
          fi

  # Builds the Rust API docs with `cargo doc` and uploads them to the PR
  # bucket under gha/<commit sha>/docs.
  docs:
    name: "Docs Preview"
    needs: credentials
    runs-on: "ubuntu-latest"
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main or forked repository (needs credentials)
      - id: gcp-auth
        name: "Authenticate to Google Cloud with JSON Credentials"
        uses: google-github-actions/auth@v1
        with:
          credentials_json: '${{ secrets.ICU4X_GCP_SA_KEY }}'
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      - name: Build docs
        uses: actions-rs/cargo@v1
        with:
          command: doc
          # Exclude tool and derive crates
          args: >
            --workspace --release --all-features --no-deps --lib
            --exclude icu_benchmark_macros
            --exclude icu_ffi_coverage
            --exclude icu_provider_macros
            --exclude databake-derive
            --exclude yoke-derive
            --exclude zerofrom-derive
            --exclude zerovec-derive

      - name: Upload docs to Google Cloud Storage
        run: |
          gsutil -m cp -r target/doc gs://${{ env.GCP_PR_BUCKET_ID }}/gha/${{ github.sha }}/docs

      - name: "⭐⭐⭐ Links to Uploaded Artifacts ⭐⭐⭐"
        run: |
          echo "::group::📖 Docs Preview"
          echo "http://${{ env.GCP_PR_BUCKET_ID }}.storage.googleapis.com/gha/${{ github.sha }}/docs/icu/index.html"
          echo "::endgroup::"

  # Builds the Sphinx documentation for the C++ and JS FFI bindings and uploads
  # both to the PR bucket under gha/<commit sha>/ffi/{cpp,js}.
  ffi-docs:
    name: "FFI Preview"
    needs: credentials
    runs-on: "ubuntu-latest"
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main or forked repository (needs credentials)
      - id: gcp-auth
        name: "Authenticate to Google Cloud with JSON Credentials"
        uses: google-github-actions/auth@v1
        with:
          credentials_json: '${{ secrets.ICU4X_GCP_SA_KEY }}'
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      - name: Install Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'

      - name: Install Sphinx
        uses: BSFishy/pip-action@v1
        with:
          packages: |
            sphinx
            sphinx-rtd-theme

      - name: Build CPP docs
        run: |
          cd ffi/diplomat/cpp/docs
          make html
          cd ../../../..

      - name: Build JS docs
        run: |
          cd ffi/diplomat/js/docs
          make html
          cd ../../../..

      - name: Upload docs to Google Cloud Storage
        run: |
          gsutil -m cp -r ffi/diplomat/cpp/docs/build/html gs://${{ env.GCP_PR_BUCKET_ID }}/gha/${{ github.sha }}/ffi/cpp
          gsutil -m cp -r ffi/diplomat/js/docs/build/html gs://${{ env.GCP_PR_BUCKET_ID }}/gha/${{ github.sha }}/ffi/js

      - name: "⭐⭐⭐ Links to Uploaded Artifacts ⭐⭐⭐"
        run: |
          echo "::group::📖 CPP Docs Preview"
          echo "http://${{ env.GCP_PR_BUCKET_ID }}.storage.googleapis.com/gha/${{ github.sha }}/ffi/cpp/index.html"
          echo "::endgroup::"
          echo "::group::📖 JS Docs Preview"
          echo "http://${{ env.GCP_PR_BUCKET_ID }}.storage.googleapis.com/gha/${{ github.sha }}/ffi/js/index.html"
          echo "::endgroup::"

  # Builds the webpack bundle of the wasm demo and uploads it to the main
  # bucket under gha/wasm-demo.
  wasm-demo:
    # This is too expensive to run on every push, so only run it on main.
    if: github.ref == 'refs/heads/main' && github.repository == 'unicode-org/icu4x'
    name: WASM Demo
    runs-on: "ubuntu-latest"
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main repository
      - id: gcp-auth
        name: "Authenticate to Google Cloud with Workload Identity Provider"
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: "projects/66042061814/locations/global/workloadIdentityPools/icu4x-gha-pool1/providers/icu4x-gha-provider1"
          # NOTE(review): this value looks mangled by e-mail obfuscation;
          # restore the real service account address before using this file.
          service_account: "[email protected]"
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      - name: Install Node.js v16.18.0
        uses: actions/setup-node@v3
        with:
          node-version: '16.18.0'
          cache: 'npm'
          cache-dependency-path: '**/package-lock.json'

      - name: Init node package
        run: |
          cd ffi/diplomat/js/examples/node
          make
          cd ../wasm-demo
          npm ci

      - name: Run Webpack
        run: |
          cd ffi/diplomat/js/examples/wasm-demo
          npm run build

      # Copies index.html next to the bundle and writes an index.js that
      # injects a <script> tag loading ./bundle.js.
      - name: Put index.html in dist for temp URL
        run: |
          cp ffi/diplomat/js/examples/wasm-demo/index.html ffi/diplomat/js/examples/wasm-demo/dist/index.html
          printf "const gcs=document.createElement('script');gcs.setAttribute('src','./bundle.js');document.body.appendChild(gcs);" > ffi/diplomat/js/examples/wasm-demo/dist/index.js

      - name: Upload wasm-demo bundle to Google Cloud Storage
        run: |
          gsutil -m cp -r ffi/diplomat/js/examples/wasm-demo/dist/* gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/wasm-demo

      - name: "⭐⭐⭐ Links to Uploaded Artifacts ⭐⭐⭐"
        run: |
          echo "::group::Wasm Demo Preview"
          echo "https://storage.googleapis.com/${{ env.GCP_MAIN_BUCKET_ID }}/gha/wasm-demo/index.html"
          echo "::endgroup::"

  # Runs `cargo bench` once per matrix component, merges the result with the
  # history stored in the main bucket, and uploads the updated history.
  bench-perf:
    # This is too expensive to run on every push, so only run it on main.
    # When running on a PR, comment this out and set the BASELINE variable below to the baseline commit.
    # NOTE(review): no BASELINE variable appears in this workflow — confirm
    # whether the instruction above is stale.
    if: github.ref == 'refs/heads/main' && github.repository == 'unicode-org/icu4x'
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        component:
          - components/collections
          - components/datetime
          - components/locid
          - components/locid_transform
          - components/plurals
          - components/segmenter
          - experimental/transliterate
          - experimental/zerotrie
          - utils/fixed_decimal
          - utils/litemap
          - utils/tinystr
          - utils/writeable
          - utils/zerovec
    concurrency:
      # Allow one run at a time to include the previous run's results
      group: bench-perf-${{ matrix.component }}

    # If you are modifying and debugging is required, don't be afraid to get
    # messy in a personal fork, if no better way to do it.
    # Example "debugging" workflow: https://github.com/echeran/icu4x/actions/runs/296714990

    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main repository
      - id: gcp-auth
        name: "Authenticate to Google Cloud with Workload Identity Provider"
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: "projects/66042061814/locations/global/workloadIdentityPools/icu4x-gha-pool1/providers/icu4x-gha-provider1"
          # NOTE(review): this value looks mangled by e-mail obfuscation;
          # restore the real service account address before using this file.
          service_account: "[email protected]"
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      # Cargo-make boilerplate
      # The md5 of the `cargo search` result line is used as the cache key, so
      # the cached binary is invalidated when that line changes.
      - name: Get cargo-make version
        id: cargo-make-version
        run: |
          echo "hash=$(cargo search cargo-make | grep '^cargo-make =' | md5sum)" >> "$GITHUB_OUTPUT"
        shell: bash
      - name: Attempt to load cached cargo-make
        uses: actions/cache@v3
        id: cargo-make-cache
        with:
          path: |
            ~/.cargo/bin/cargo-make
            ~/.cargo/bin/cargo-make.exe
          key: ${{ runner.os }}-make-${{ steps.cargo-make-version.outputs.hash }}
      - name: Install cargo-make
        if: steps.cargo-make-cache.outputs.cache-hit != 'true'
        run: cargo +stable install cargo-make

      # Actual job
      - name: Setup output data directory
        run: |
          mkdir -p benchmarks/perf/${{ matrix.component }}

      - name: Run benchmark
        # An explicit `shell: bash` makes the runner enable `pipefail`, so a
        # failing `cargo bench` is no longer masked by the exit status of `tee`.
        shell: bash
        run: |
          cargo bench --features bench --manifest-path ${{ matrix.component }}/Cargo.toml -- --output-format bencher | tee benchmarks/perf/${{ matrix.component }}/output.txt;

      - name: Download previous benchmark data
        run: |
          gsutil -m cp -rn gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/perf/${{ matrix.component }}/* benchmarks/perf/${{ matrix.component }}

      - name: Store benchmark result & create dashboard
        # NOTE(review): this reference looks mangled by e-mail obfuscation;
        # presumably it was `rhysd/github-action-benchmark@<version>` —
        # restore the real ref before using this file.
        uses: rhysd/[email protected]
        with:
          name: Rust Benchmark
          tool: 'cargo'
          output-file-path: ./benchmarks/perf/${{ matrix.component }}/output.txt
          benchmark-data-dir-path: ./benchmarks/perf/${{ matrix.component }}
          # Show alert with commit comment on detecting possible performance regression
          alert-threshold: '200%'  # If for nothing else, enabling the possibility of alerts with meaningful thresholds requires this job to be done per-component
          fail-on-alert: true
          # comment-on-alert: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          gh-pages-branch: empty
          # alert-comment-cc-users: '@sffc,@zbraniecki,@echeran'

      - name: Upload new benchmark data
        # Also runs after a failed step (e.g. a benchmark alert).
        if: success() || failure()
        run: |
          git checkout empty
          gsutil -m cp -r benchmarks/perf/${{ matrix.component }}/* gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/perf/${{ matrix.component }}

  # Run examples with dhat-rs in order to collect memory heap size metrics. These
  # metrics will then be charted over time. See tools/benchmark/memory/README.md for
  # more information.
  # dhat-rs:
  #   https://github.com/nnethercote/dhat-rs
  # Benchmarking action (forked):
  #   https://github.com/gregtatum/github-action-benchmark
  # The memory data is collected in:
  #   benchmarks/memory/{os}/output.ndjson
  # The full data report is stored in:
  #   benchmarks/memory/{os}/{example}-dhat-heap.json
  bench-memory:
    # This is too expensive to run on every push, so only run it on main.
    # When running on a PR, comment this out and set the BASELINE variable below to the baseline commit.
    # NOTE(review): no BASELINE variable appears in this workflow — confirm
    # whether the instruction above is stale.
    if: github.ref == 'refs/heads/main' && github.repository == 'unicode-org/icu4x'
    strategy:
      fail-fast: false
      # Create a matrix of all platforms, and all components. Each job then can run
      # multiple examples in that job. The examples are defined as a space separated
      # list of the name of the examples. The examples are assumed to be in the
      # examples folder.
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.os }}
    concurrency:
      # Allow one run at a time to include the previous run's results
      group: bench-memory-${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main repository
      - id: gcp-auth
        name: "Authenticate to Google Cloud with Workload Identity Provider"
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: "projects/66042061814/locations/global/workloadIdentityPools/icu4x-gha-pool1/providers/icu4x-gha-provider1"
          # NOTE(review): this value looks mangled by e-mail obfuscation;
          # restore the real service account address before using this file.
          service_account: "[email protected]"
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      # Cargo-make boilerplate
      - name: Get cargo-make version
        id: cargo-make-version
        run: |
          echo "hash=$(cargo search cargo-make | grep '^cargo-make =' | md5sum)" >> "$GITHUB_OUTPUT"
        shell: bash
      - name: Attempt to load cached cargo-make
        uses: actions/cache@v3
        id: cargo-make-cache
        with:
          path: |
            ~/.cargo/bin/cargo-make
            ~/.cargo/bin/cargo-make.exe
          key: ${{ runner.os }}-make-${{ steps.cargo-make-version.outputs.hash }}
      - name: Install cargo-make
        if: steps.cargo-make-cache.outputs.cache-hit != 'true'
        run: cargo +stable install cargo-make

      # Actual job
      - name: Run the example with dhat-rs to collect memory information
        run: |
          cargo run --package icu_benchmark_memory -- --os ${{ matrix.os }}

      - name: Download previous benchmark data
        run: |
          gsutil -m cp -rn gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/memory/${{ matrix.os }}/* benchmarks/memory/${{ matrix.os }}

      - name: Store benchmark result & create dashboard
        # The gregtatum fork of rhysd/github-action-benchmark contains support for ndjson.
        # If the PR gets merged, this can be switched back to the main project.
        # https://github.com/rhysd/github-action-benchmark/pull/54
        uses: gregtatum/github-action-benchmark@d3f06f738e9612988d575db23fae5ca0008d3d12
        with:
          name: Heap – ${{ matrix.os }}
          # The ndjson tool is only supported by the gregtatum fork of github-action-benchmark.
          tool: 'ndjson'
          benchmark-data-dir-path: ./benchmarks/memory/${{ matrix.os }}
          output-file-path: ./benchmarks/memory/${{ matrix.os }}/output.ndjson
          alert-threshold: '200%'
          fail-on-alert: true
          # comment-on-alert: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          gh-pages-branch: empty
          # alert-comment-cc-users: '@sffc,@zbraniecki,@echeran'

      - name: Upload new benchmark data
        # Also runs after a failed step (e.g. a benchmark alert).
        if: success() || failure()
        run: |
          git checkout empty
          # Delete intermediate files
          rm benchmarks/memory/${{ matrix.os }}/*-dhat-heap.json
          gsutil -m cp -r benchmarks/memory/${{ matrix.os }}/* gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/memory/${{ matrix.os }}

  # Binary size benchmark: build and size wasm binaries; creates ndjson output data format
  bench-binsize:
    # This is too expensive to run on every push, so only run it on main.
    # When running on a PR, comment this out and set the BASELINE variable below to the baseline commit.
    # NOTE(review): no BASELINE variable appears in this workflow — confirm
    # whether the instruction above is stale.
    if: github.ref == 'refs/heads/main' && github.repository == 'unicode-org/icu4x'
    runs-on: ubuntu-latest
    concurrency:
      # Allow one run at a time to include the previous run's results
      group: bench-binsize
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main repository
      - id: gcp-auth
        name: "Authenticate to Google Cloud with Workload Identity Provider"
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: "projects/66042061814/locations/global/workloadIdentityPools/icu4x-gha-pool1/providers/icu4x-gha-provider1"
          # NOTE(review): this value looks mangled by e-mail obfuscation;
          # restore the real service account address before using this file.
          service_account: "[email protected]"
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      # Cargo-make boilerplate
      - name: Get cargo-make version
        id: cargo-make-version
        run: |
          echo "hash=$(cargo search cargo-make | grep '^cargo-make =' | md5sum)" >> "$GITHUB_OUTPUT"
        shell: bash
      - name: Attempt to load cached cargo-make
        uses: actions/cache@v3
        id: cargo-make-cache
        with:
          path: |
            ~/.cargo/bin/cargo-make
            ~/.cargo/bin/cargo-make.exe
          key: ${{ runner.os }}-make-${{ steps.cargo-make-version.outputs.hash }}
      - name: Install cargo-make
        if: steps.cargo-make-cache.outputs.cache-hit != 'true'
        run: cargo +stable install cargo-make

      # Job-specific dependencies
      - name: Install Node.js v16.18.0
        uses: actions/setup-node@v3
        with:
          node-version: '16.18.0'
          cache: 'npm'
          cache-dependency-path: '**/package-lock.json'

      - name: Install npm tools
        run: |
          npm install -g wasm-opt

      # Actual job
      - name: Build wasm executables
        run: cargo make wasm-opt

      - name: gzip wasm executables
        run: |
          cd wasmpkg/wasm-opt
          gzip -k *+opt.wasm
          cd ../..

      - name: Setup output data directory
        run: |
          mkdir -p benchmarks/binsize/wasm
          mkdir -p benchmarks/binsize/gz

      - name: Measure size of executables (wasm)
        # Explicit `shell: bash` enables `pipefail`, so a failing `cargo run`
        # is not hidden behind `tee`.
        shell: bash
        run: |
          cargo run --package icu_benchmark_binsize -- wasmpkg/wasm-opt wasm | tee benchmarks/binsize/wasm/output.txt

      - name: Measure size of executables (gz)
        # Explicit `shell: bash` enables `pipefail`, so a failing `cargo run`
        # is not hidden behind `tee`.
        shell: bash
        run: |
          cargo run --package icu_benchmark_binsize -- wasmpkg/wasm-opt gz | tee benchmarks/binsize/gz/output.txt

      - name: Download previous benchmark data
        run: |
          mkdir -p benchmarks/binsize
          gsutil -m cp -rn gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/binsize/* benchmarks/binsize

      - name: Store benchmark result & create dashboard (wasm)
        # Use gregtatum special feature to process ndjson-formatted benchmark data
        uses: gregtatum/github-action-benchmark@d3f06f738e9612988d575db23fae5ca0008d3d12
        with:
          tool: 'ndjson'
          output-file-path: benchmarks/binsize/wasm/output.txt
          benchmark-data-dir-path: ./benchmarks/binsize/wasm
          # Tentative setting, optimized value to be determined
          alert-threshold: '200%'
          fail-on-alert: true
          # comment-on-alert: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          gh-pages-branch: empty
          # alert-comment-cc-users: '@gnrunge,@sffc,@zbraniecki,@echeran'

      - name: Store benchmark result & create dashboard (gz)
        # Still processed when the wasm step above failed (e.g. on an alert).
        if: success() || failure()
        # Use gregtatum special feature to process ndjson-formatted benchmark data
        uses: gregtatum/github-action-benchmark@d3f06f738e9612988d575db23fae5ca0008d3d12
        with:
          tool: 'ndjson'
          output-file-path: benchmarks/binsize/gz/output.txt
          benchmark-data-dir-path: ./benchmarks/binsize/gz
          # Tentative setting, optimized value to be determined
          alert-threshold: '200%'
          fail-on-alert: true
          # comment-on-alert: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          gh-pages-branch: empty
          skip-fetch-gh-pages: true
          # alert-comment-cc-users: '@gnrunge,@sffc,@zbraniecki,@echeran'

      - name: Upload new benchmark data
        if: success() || failure()
        run: |
          git checkout empty
          gsutil -m cp -r benchmarks/binsize/* gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/binsize

  # Data size benchmark: track size of provider/datagen/tests/data/testdata.postcard (total data size).
  # NOTE(review): the measuring step below reads
  # provider/testdata/data/testdata.postcard, which differs from the path named
  # above — confirm which one is current.
  bench-datasize:
    # This is too expensive to run on every push, so only run it on main.
    # When running on a PR, comment this out and set the BASELINE variable below to the baseline commit.
    # NOTE(review): no BASELINE variable appears in this workflow — confirm
    # whether the instruction above is stale.
    if: github.ref == 'refs/heads/main' && github.repository == 'unicode-org/icu4x'
    concurrency:
      # Allow one run at a time to include the previous run's results
      group: bench-datasize
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main repository
      - id: gcp-auth
        name: "Authenticate to Google Cloud with Workload Identity Provider"
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: "projects/66042061814/locations/global/workloadIdentityPools/icu4x-gha-pool1/providers/icu4x-gha-provider1"
          # NOTE(review): this value looks mangled by e-mail obfuscation;
          # restore the real service account address before using this file.
          service_account: "[email protected]"
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      - name: Install rustfmt
        run: |
          rustup component add rustfmt

      - name: Setup output data directory
        run: |
          mkdir -p benchmarks/datasize

      - name: Generate testdata
        run: |
          cargo run --bin make-testdata-legacy

      - name: Measure size of selected data package provider/testdata/data/testdata.postcard
        # Explicit `shell: bash` enables `pipefail`, so a failing `cargo run`
        # is not hidden behind `tee`.
        shell: bash
        run: |
          cargo run --package icu_benchmark_binsize -- provider/testdata/data/testdata.postcard file | tee benchmarks/datasize/output.txt

      - name: Download previous benchmark data
        run: |
          mkdir -p benchmarks
          gsutil -m cp -rn gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/datasize/* benchmarks/datasize

      - name: Store benchmark result & create dashboard
        # Use gregtatum special feature to process ndjson-formatted benchmark data
        uses: gregtatum/github-action-benchmark@d3f06f738e9612988d575db23fae5ca0008d3d12
        with:
          tool: 'ndjson'
          output-file-path: benchmarks/datasize/output.txt
          benchmark-data-dir-path: ./benchmarks/datasize
          # Tentative setting, optimized value to be determined
          alert-threshold: '100%'
          fail-on-alert: true
          # comment-on-alert: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          gh-pages-branch: empty
          # alert-comment-cc-users: '@gnrunge,@sffc,@zbraniecki,@echeran'

      - name: Upload new benchmark data
        if: success() || failure()
        run: |
          git checkout empty
          gsutil -m cp -r benchmarks/datasize/* gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks/datasize

  # Collects the artifacts uploaded by the jobs above from both buckets into
  # tools/website-skeleton and deploys that directory to GitHub Pages.
  gh-pages:
    name: "Deploy to GitHub Pages"
    needs: [docs, ffi-docs, wasm-demo, bench-perf, bench-memory, bench-datasize]  # bench-binsize
    # Run this even when one of the above jobs failed. This is so we can at least push the other artifacts.
    if: (success() || failure()) && (github.ref == 'refs/heads/main' && github.repository == 'unicode-org/icu4x')
    runs-on: 'ubuntu-latest'
    concurrency:
      group: "pages"
      cancel-in-progress: true
    steps:
      - uses: actions/checkout@v3

      # GCP Boilerplate for jobs in main repository
      - id: gcp-auth
        name: "Authenticate to Google Cloud with Workload Identity Provider"
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: "projects/66042061814/locations/global/workloadIdentityPools/icu4x-gha-pool1/providers/icu4x-gha-provider1"
          # NOTE(review): this value looks mangled by e-mail obfuscation;
          # restore the real service account address before using this file.
          service_account: "[email protected]"
      - name: "Set up Google Cloud SDK"
        uses: google-github-actions/setup-gcloud@v1

      # Every copy is best-effort (`|| true`): an artifact missing because its
      # producing job failed does not stop the remaining ones from deploying.
      - name: Download artifacts
        run: |
          gsutil -m cp -rn gs://${{ env.GCP_PR_BUCKET_ID }}/gha/${{ github.sha }}/docs tools/website-skeleton || true
          mkdir -p tools/website-skeleton/docs/ffi
          gsutil -m cp -r gs://${{ env.GCP_PR_BUCKET_ID }}/gha/${{ github.sha }}/ffi/cpp tools/website-skeleton/docs/ffi || true
          gsutil -m cp -r gs://${{ env.GCP_PR_BUCKET_ID }}/gha/${{ github.sha }}/ffi/js tools/website-skeleton/docs/ffi || true
          gsutil -m cp -r gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/benchmarks tools/website-skeleton/ || true
          gsutil -m cp -r gs://${{ env.GCP_MAIN_BUCKET_ID }}/gha/wasm-demo tools/website-skeleton/ || true

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v1
        with:
          path: 'tools/website-skeleton'

      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v1